diff --git "a/logs/logs/xp3capmixnewcodelonglossseq/main_log.txt" "b/logs/logs/xp3capmixnewcodelonglossseq/main_log.txt" new file mode 100644--- /dev/null +++ "b/logs/logs/xp3capmixnewcodelonglossseq/main_log.txt" @@ -0,0 +1,33719 @@ +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:using world size: 16, data-parallel-size: 16, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 16 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2075085.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 10 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 4096 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 1024 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 1024 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/kill-switch-tr13e-350M-mtf +[default0]: kv_channels ..................................... 64 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 1 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 1 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/tr13e-350M-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation_pretraining', 'validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whit[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +espace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document'], ['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0'], ['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.0330676168743166', '0.011242051312222764', '0.13027200903379185', '0.22171164529099704', '0.10667815627928671', '0.0015595123898173287', '0.13054018439603915', '0.01091803753667153', '0.00011021422347108609', '0.005492381453597748', '0.0004021215011318779', '0.007470068593492175', '0.0006190467776576425', '0.0010335296343329384', '0.0005012010684646179', '0.0006672772956128299', '0.00035928138344705506', '0.0005084433130291778', '0.0021137328219915496', '0.0009129946225980253', '0.0012454301613725426', '0.00031588689199263235', '0.08137213783015229', '0.055293935695898196', '0.04954150576361177', '0.02461641286531197', '0.12091748245519074', '0.0005177025345001541'], ['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 16 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 64 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-06 12:23:21,890] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[default3]:> setting tensorboard ... +[default0]:> initializing tensor model parallel with size 1 +[default0]:> initializing pipeline model parallel with size 1 +[default0]:> setting random seeds to 42 ... +[default0]:[2022-10-06 12:23:23,241] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2760 and data parallel seed: 42 +[default0]:> compiling dataset index builder ... +[default0]:make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/data' +[default0]:make: Nothing to be done for 'default'. +[default0]:make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/data' +[default0]:>>> done with dataset index builder. Compilation time: 0.073 seconds +[default0]:> compiling and loading fused kernels ... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module fused_mix_prec_layer_norm_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module fused_mix_prec_layer_norm_cuda... +[default0]:>>> done with compiling and loading fused kernels. Compilation time: 3.747 seconds +[default0]:time to initialize megatron (seconds): 31.076 +[default0]:[after megatron is initialized] datetime: 2022-10-06 12:23:27 +[default0]:building GPT model ... +[default0]:[2022-10-06 12:23:27,108] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2022-10-06 12:23:27,108] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +[default0]:[2022-10-06 12:23:27,108] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 28.05 GB, percent = 5.6% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15} +[default0]:[2022-10-06 12:23:27,571] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer|embedding +[default0]:stage=0 layers=31 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: ParallelTransformerLayerPipe +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: undo +[default0]: 28: MixedFusedLayerNorm +[default0]: 29: EmbeddingPipe +[default0]: 30: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2022-10-06 12:23:27,649] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2022-10-06 12:23:27,649] [INFO] [utils.py:828:see_memory_usage] MA 1.04 GB Max_MA 1.04 GB CA 1.09 GB Max_CA 1 GB +[default0]:[2022-10-06 12:23:27,649] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 28.09 GB, percent = 5.6% +[default0]:setting training iterations to 6200 +[default0]:> learning rate decay style: constant +[default0]:DeepSpeed is enabled. +[default0]:[2022-10-06 12:23:27,651] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.1+8b2a6371, git-hash=8b2a6371, git-branch=master +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer +[default2]: model, optimizer, _, lr_scheduler = deepspeed.initialize( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize +[default2]: engine = PipelineEngine(args=args, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ +[default2]: super().__init__(*super_args, **super_kwargs) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ +[default2]: self._do_args_sanity_check(args) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check +[default2]: assert ( +[default2]:AssertionError: Mismatch in local rank setting, args.local_rank=6 but env['LOCAL_RANK']=2. +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer +[default1]: model, optimizer, _, lr_scheduler = deepspeed.initialize( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize +[default1]: engine = PipelineEngine(args=args, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ +[default1]: super().__init__(*super_args, **super_kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ +[default1]: self._do_args_sanity_check(args) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check +[default1]: assert ( +[default1]:AssertionError: Mismatch in local rank setting, args.local_rank=5 but env['LOCAL_RANK']=1. +[default3]:Traceback (most recent call last): +[default2]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer +[default2]: model, optimizer, _, lr_scheduler = deepspeed.initialize( +[default3]: model, optimizer, _, lr_scheduler = deepspeed.initialize( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize +[default2]: engine = PipelineEngine(args=args, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ +[default3]: engine = PipelineEngine(args=args, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ +[default3]: super().__init__(*super_args, **super_kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ +[default3]: self._do_args_sanity_check(args) +[default2]: super().__init__(*super_args, **super_kwargs) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ +[default2]: self._do_args_sanity_check(args) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check +[default2]: assert ( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check +[default3]: assert ( +[default3]:AssertionError: Mismatch in local rank setting, args.local_rank=7 but env['LOCAL_RANK']=3. +[default2]:AssertionError: Mismatch in local rank setting, args.local_rank=6 but env['LOCAL_RANK']=2. +[default1]:Traceback (most recent call last): +[default0]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer +[default1]: model, optimizer, _, lr_scheduler = deepspeed.initialize( +[default0]: model, optimizer, _, lr_scheduler = deepspeed.initialize( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize +[default1]: engine = PipelineEngine(args=args, +[default0]: engine = PipelineEngine(args=args, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ +[default1]: super().__init__(*super_args, **super_kwargs) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ +[default0]: super().__init__(*super_args, **super_kwargs) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ +[default0]: self._do_args_sanity_check(args) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check +[default0]: assert ( +[default0]:AssertionError: Mismatch in local rank setting, args.local_rank=4 but env['LOCAL_RANK']=0. +[default1]: self._do_args_sanity_check(args) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check +[default1]: assert ( +[default1]:AssertionError: Mismatch in local rank setting, args.local_rank=5 but env['LOCAL_RANK']=1. +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer +[default0]: model, optimizer, _, lr_scheduler = deepspeed.initialize( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize +[default0]: engine = PipelineEngine(args=args, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ +[default0]: super().__init__(*super_args, **super_kwargs) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ +[default0]: self._do_args_sanity_check(args) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check +[default0]: assert ( +[default0]:AssertionError: Mismatch in local rank setting, args.local_rank=4 but env['LOCAL_RANK']=0. +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer +[default3]: model, optimizer, _, lr_scheduler = deepspeed.initialize( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize +[default3]: engine = PipelineEngine(args=args, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ +[default3]: super().__init__(*super_args, **super_kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ +[default3]: self._do_args_sanity_check(args) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check +[default3]: assert ( +[default3]:AssertionError: Mismatch in local rank setting, args.local_rank=7 but env['LOCAL_RANK']=3. +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2859764) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 73111) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 318.214070558548 seconds +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:23:28 + host : jean-zay-iam23-ib0 + rank : 5 (local_rank: 1) + exitcode : 1 (pid: 2859765) + error_file: /tmp/torchelastic_9fa93ddc/none_jseazjdl/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer + model, optimizer, _, lr_scheduler = deepspeed.initialize( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize + engine = PipelineEngine(args=args, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ + super().__init__(*super_args, **super_kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ + self._do_args_sanity_check(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check + assert ( + AssertionError: Mismatch in local rank setting, args.local_rank=5 but env['LOCAL_RANK']=1. + +[2]: + time : 2022-10-06_12:23:28 + host : jean-zay-iam23-ib0 + rank : 6 (local_rank: 2) + exitcode : 1 (pid: 2859766) + error_file: /tmp/torchelastic_9fa93ddc/none_jseazjdl/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer + model, optimizer, _, lr_scheduler = deepspeed.initialize( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize + engine = PipelineEngine(args=args, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ + super().__init__(*super_args, **super_kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ + self._do_args_sanity_check(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check + assert ( + AssertionError: Mismatch in local rank setting, args.local_rank=6 but env['LOCAL_RANK']=2. + +[3]: + time : 2022-10-06_12:23:28 + host : jean-zay-iam23-ib0 + rank : 7 (local_rank: 3) + exitcode : 1 (pid: 2859767) + error_file: /tmp/torchelastic_9fa93ddc/none_jseazjdl/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer + model, optimizer, _, lr_scheduler = deepspeed.initialize( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize + engine = PipelineEngine(args=args, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ + super().__init__(*super_args, **super_kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ + self._do_args_sanity_check(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check + assert ( + AssertionError: Mismatch in local rank setting, args.local_rank=7 but env['LOCAL_RANK']=3. + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:23:28 + host : jean-zay-iam23-ib0 + rank : 4 (local_rank: 0) + exitcode : 1 (pid: 2859764) + error_file: /tmp/torchelastic_9fa93ddc/none_jseazjdl/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer + model, optimizer, _, lr_scheduler = deepspeed.initialize( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize + engine = PipelineEngine(args=args, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ + super().__init__(*super_args, **super_kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ + self._do_args_sanity_check(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check + assert ( + AssertionError: Mismatch in local rank setting, args.local_rank=4 but env['LOCAL_RANK']=0. + +============================================================ +ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 318.5136456489563 seconds +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier + store_util.barrier( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier + synchronize(store, data, rank, world_size, key_prefix, barrier_timeout) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize + agent_data = get_all(store, rank, key_prefix, world_size) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all + data = store.get(f"{prefix}{idx}") +RuntimeError: Socket Timeout +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:23:28 + host : jean-zay-iam25-ib0 + rank : 13 (local_rank: 1) + exitcode : 1 (pid: 73112) + error_file: /tmp/torchelastic_yu68bo9w/none_v0a508l3/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer + model, optimizer, _, lr_scheduler = deepspeed.initialize( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize + engine = PipelineEngine(args=args, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ + super().__init__(*super_args, **super_kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ + self._do_args_sanity_check(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check + assert ( + AssertionError: Mismatch in local rank setting, args.local_rank=5 but env['LOCAL_RANK']=1. + +[2]: + time : 2022-10-06_12:23:28 + host : jean-zay-iam25-ib0 + rank : 14 (local_rank: 2) + exitcode : 1 (pid: 73113) + error_file: /tmp/torchelastic_yu68bo9w/none_v0a508l3/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer + model, optimizer, _, lr_scheduler = deepspeed.initialize( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize + engine = PipelineEngine(args=args, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ + super().__init__(*super_args, **super_kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ + self._do_args_sanity_check(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check + assert ( + AssertionError: Mismatch in local rank setting, args.local_rank=6 but env['LOCAL_RANK']=2. + +[3]: + time : 2022-10-06_12:23:28 + host : jean-zay-iam25-ib0 + rank : 15 (local_rank: 3) + exitcode : 1 (pid: 73114) + error_file: /tmp/torchelastic_yu68bo9w/none_v0a508l3/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer + model, optimizer, _, lr_scheduler = deepspeed.initialize( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize + engine = PipelineEngine(args=args, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ + super().__init__(*super_args, **super_kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ + self._do_args_sanity_check(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check + assert ( + AssertionError: Mismatch in local rank setting, args.local_rank=7 but env['LOCAL_RANK']=3. + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:23:28 + host : jean-zay-iam25-ib0 + rank : 12 (local_rank: 0) + exitcode : 1 (pid: 73111) + error_file: /tmp/torchelastic_yu68bo9w/none_v0a508l3/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 424, in setup_model_and_optimizer + model, optimizer, _, lr_scheduler = deepspeed.initialize( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/__init__.py", line 137, in initialize + engine = PipelineEngine(args=args, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/pipe/engine.py", line 59, in __init__ + super().__init__(*super_args, **super_kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 261, in __init__ + self._do_args_sanity_check(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 906, in _do_args_sanity_check + assert ( + AssertionError: Mismatch in local rank setting, args.local_rank=4 but env['LOCAL_RANK']=0. + +============================================================ +srun: error: jean-zay-iam23: task 1: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2075085.0 +slurmstepd: error: *** STEP 2075085.0 ON jean-zay-iam22 CANCELLED AT 2022-10-06T12:28:52 *** +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3724263 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3724264 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 879160 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 879161 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3724265 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3724266 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 879162 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 879163 closing signal SIGTERM +srun: error: jean-zay-iam25: task 3: Exited with exit code 1 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 879120 got signal: 15 +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 236, in launch_agent + result = agent.run() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 125, in wrapper + result = f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run + result = self._invoke_run(role) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 850, in _invoke_run + time.sleep(monitor_interval) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 60, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 3724221 got signal: 15 +srun: error: jean-zay-iam24: task 2: Exited with exit code 1 +srun: error: jean-zay-iam22: task 0: Exited with exit code 1 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:using world size: 32, data-parallel-size: 32, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 32 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2075229.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 10 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 4096 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 1024 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 1024 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/kill-switch-tr13e-350M-mtf +[default0]: kv_channels ..................................... 64 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 1 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 1 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/tr13e-350M-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation_pretraining', 'validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document'], ['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0'], ['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.0330676168743166', '0.011242051312222764', '0.13027200903379185', '0.22171164529099704', '0.10667815627928671', '0.0015595123898173287', '0.13054018439603915', '0.01091803753667153', '0.00011021422347108609', '0.005492381453597748', '0.0004021215011318779', '0.007470068593492175', '0.0006190467776576425', '0.0010335296343329384', '0.0005012010684646179', '0.0006672772956128299', '0.00035928138344705506', '0.0005084433130291778', '0.0021137328219915496', '0.0009129946225980253', '0.0012454301613725426', '0.00031588689199263235', '0.08137213783015229', '0.055293935695898196', '0.04954150576361177', '0.02461641286531197', '0.12091748245519074', '0.0005177025345001541'], ['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 32 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 32 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-06 12:34:43,630] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[default7]:> setting tensorboard ... +[default0]:> initializing tensor model parallel with size 1 +[default0]:> initializing pipeline model parallel with size 1 +[default0]:> setting random seeds to 42 ... +[default0]:[2022-10-06 12:34:45,906] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2760 and data parallel seed: 42 +[default0]:> compiling dataset index builder ... +[default0]:make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/data' +[default0]:make: Nothing to be done for 'default'. +[default0]:make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/data' +[default0]:>>> done with dataset index builder. Compilation time: 0.109 seconds +[default0]:> compiling and loading fused kernels ... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module fused_mix_prec_layer_norm_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module fused_mix_prec_layer_norm_cuda... +[default0]:>>> done with compiling and loading fused kernels. Compilation time: 6.037 seconds +[default0]:time to initialize megatron (seconds): -51.945 +[default0]:[after megatron is initialized] datetime: 2022-10-06 12:34:52 +[default0]:building GPT model ... +[default0]:[2022-10-06 12:34:52,108] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2022-10-06 12:34:52,108] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +[default0]:[2022-10-06 12:34:52,108] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.25 GB, percent = 6.6% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pipe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31} +[default0]:[2022-10-06 12:34:53,073] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer|embedding +[default0]:stage=0 layers=31 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: ParallelTransformerLayerPipe +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: undo +[default0]: 28: MixedFusedLayerNorm +[default0]: 29: EmbeddingPipe +[default0]: 30: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2022-10-06 12:34:53,169] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2022-10-06 12:34:53,169] [INFO] [utils.py:828:see_memory_usage] MA 1.04 GB Max_MA 1.04 GB CA 1.09 GB Max_CA 1 GB +[default0]:[2022-10-06 12:34:53,170] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.36 GB, percent = 6.6% +[default0]:setting training iterations to 6200 +[default0]:> learning rate decay style: constant +[default0]:DeepSpeed is enabled. +[default0]:[2022-10-06 12:34:53,172] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.1+8b2a6371, git-hash=8b2a6371, git-branch=master +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:[2022-10-06 12:34:55,473] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[default0]:[2022-10-06 12:34:55,473] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +[default0]:[2022-10-06 12:34:55,473] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[default0]:[2022-10-06 12:34:55,481] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +[default0]:[2022-10-06 12:34:55,481] [INFO] [utils.py:52:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[default0]:[2022-10-06 12:34:55,481] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer +[default0]:[2022-10-06 12:34:55,481] [INFO] [stage_1_and_2.py:134:__init__] Reduce bucket size 500000000 +[default0]:[2022-10-06 12:34:55,481] [INFO] [stage_1_and_2.py:135:__init__] Allgather bucket size 500000000 +[default0]:[2022-10-06 12:34:55,481] [INFO] [stage_1_and_2.py:136:__init__] CPU Offload: False +[default0]:[2022-10-06 12:34:55,481] [INFO] [stage_1_and_2.py:137:__init__] Round robin gradient partitioning: False +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Emitting ninja build file /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113/utils/build.ninja... +[default6]:Building extension module utils... +[default6]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6324665546417236 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.6324448585510254 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6323952674865723 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.6332042217254639 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.6327109336853027 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.6318850517272949 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6261317729949951 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.6322071552276611 seconds +[default7]:Loading extension module utils... +[default6]:Loading extension module utils... +[default7]:Time to load utils op: 0.6261217594146729 seconds +[default6]:Time to load utils op: 0.6261289119720459 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.6261425018310547 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.6261374950408936 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6261475086212158 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.6261444091796875 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.6261396408081055 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6257457733154297 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.6251287460327148 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.6247873306274414 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6247265338897705 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.6246633529663086 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.632662296295166 seconds +[default6]:ninja: no work to do. +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.6249279975891113 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.6249122619628906 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.6251258850097656 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.6400249004364014 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.6400132179260254 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.6372742652893066 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.6376442909240723 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.637458086013794 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.6355366706848145 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.6388750076293945 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.6364305019378662 seconds +[default0]:Rank: 16 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 17 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 20 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Rank: 21 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Rank: 5 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Rank: 24 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 25 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 26 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 4 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 28 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 30 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Rank: 31 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 27 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Rank: 29 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 2 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Rank: 0 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 1 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Rank: 7 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 3 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 6 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 12 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Rank: 13 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 18 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 19 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 14 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Rank: 15 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Rank: 23 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 22 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 10 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 11 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 9 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Rank: 8 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0005996227264404297 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0005893707275390625 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0011677742004394531 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0012524127960205078 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0013470649719238281 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Time to load utils op: 0.0014061927795410156 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0009000301361083984 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.001474142074584961 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.000942230224609375 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Loading extension module utils... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0009028911590576172 seconds +[default4]:Time to load utils op: 0.001024484634399414 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default2]:Time to load utils op: 0.0007586479187011719 seconds +[default1]:Time to load utils op: 0.0010080337524414062 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.002629518508911133 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.001310586929321289 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0009675025939941406 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0009298324584960938 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0009920597076416016 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.001445770263671875 seconds +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0009496212005615234 seconds +[default4]:Time to load utils op: 0.0011324882507324219 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default2]:Time to load utils op: 0.0013427734375 seconds +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0014224052429199219 seconds +[default3]:Time to load utils op: 0.0014157295227050781 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0005400180816650391 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0006086826324462891 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0005822181701660156 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Time to load utils op: 0.0005123615264892578 seconds +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.00051116943359375 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0010690689086914062 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0005574226379394531 seconds +[default0]:[2022-10-06 12:35:00,082] [INFO] [utils.py:827:see_memory_usage] Before initializing optimizer states +[default0]:[2022-10-06 12:35:00,082] [INFO] [utils.py:828:see_memory_usage] MA 1.11 GB Max_MA 1.12 GB CA 1.84 GB Max_CA 2 GB +[default0]:[2022-10-06 12:35:00,083] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.22 GB, percent = 7.2% +[default0]:[2022-10-06 12:35:00,123] [INFO] [utils.py:827:see_memory_usage] After initializing optimizer states +[default0]:[2022-10-06 12:35:00,124] [INFO] [utils.py:828:see_memory_usage] MA 1.24 GB Max_MA 1.3 GB CA 1.89 GB Max_CA 2 GB +[default0]:[2022-10-06 12:35:00,124] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.22 GB, percent = 7.2% +[default0]:[2022-10-06 12:35:00,124] [INFO] [stage_1_and_2.py:516:__init__] optimizer state initialized +[default0]:[2022-10-06 12:35:00,153] [INFO] [utils.py:827:see_memory_usage] After initializing ZeRO optimizer +[default0]:[2022-10-06 12:35:00,153] [INFO] [utils.py:828:see_memory_usage] MA 1.24 GB Max_MA 1.24 GB CA 1.89 GB Max_CA 2 GB +[default0]:[2022-10-06 12:35:00,153] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.22 GB, percent = 7.2% +[default0]:[2022-10-06 12:35:00,153] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[default0]:[2022-10-06 12:35:00,153] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[default0]:[2022-10-06 12:35:00,154] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[default0]:[2022-10-06 12:35:00,154] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05, 2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default0]:[2022-10-06 12:35:00,154] [INFO] [config.py:987:print] DeepSpeedEngine configuration: +[default0]:[2022-10-06 12:35:00,154] [INFO] [config.py:991:print] activation_checkpointing_config { +[default0]: "partition_activations": false, +[default0]: "contiguous_memory_optimization": false, +[default0]: "cpu_checkpointing": false, +[default0]: "number_checkpoints": null, +[default0]: "synchronize_checkpoint_boundary": false, +[default0]: "profile": false +[default0]:} +[default0]:[2022-10-06 12:35:00,154] [INFO] [config.py:991:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[default0]:[2022-10-06 12:35:00,154] [INFO] [config.py:991:print] amp_enabled .................. False +[default0]:[2022-10-06 12:35:00,154] [INFO] [config.py:991:print] amp_params ................... False +[default0]:[2022-10-06 12:35:00,154] [INFO] [config.py:991:print] autotuning_config ............ { +[default0]: "enabled": false, +[default0]: "start_step": null, +[default0]: "end_step": null, +[default0]: "metric_path": null, +[default0]: "arg_mappings": null, +[default0]: "metric": "throughput", +[default0]: "model_info": null, +[default0]: "results_dir": null, +[default0]: "exps_dir": null, +[default0]: "overwrite": true, +[default0]: "fast": true, +[default0]: "start_profile_step": 3, +[default0]: "end_profile_step": 5, +[default0]: "tuner_type": "gridsearch", +[default0]: "tuner_early_stopping": 5, +[default0]: "tuner_num_trials": 50, +[default0]: "model_info_path": null, +[default0]: "mp_size": 1, +[default0]: "max_train_batch_size": null, +[default0]: "min_train_batch_size": 1, +[default0]: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +[default0]: "min_train_micro_batch_size_per_gpu": 1, +[default0]: "num_tuning_micro_batch_sizes": 3 +[default0]:} +[default0]:[2022-10-06 12:35:00,154] [INFO] [config.py:991:print] bfloat16_enabled ............. False +[default0]:[2022-10-06 12:35:00,154] [INFO] [config.py:991:print] checkpoint_tag_validation_enabled True +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] checkpoint_tag_validation_fail False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] comms_config ................. +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] communication_data_type ...... None +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] curriculum_enabled ........... False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] curriculum_params ............ False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] dataloader_drop_last ......... False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] disable_allgather ............ False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] dump_state ................... False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] eigenvalue_enabled ........... False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] eigenvalue_gas_boundary_resolution 1 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] eigenvalue_layer_name ........ bert.encoder.layer +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] eigenvalue_layer_num ......... 0 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] eigenvalue_max_iter .......... 100 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] eigenvalue_stability ......... 1e-06 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] eigenvalue_tol ............... 0.01 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] eigenvalue_verbose ........... False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] elasticity_enabled ........... False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] flops_profiler_config ........ { +[default0]: "enabled": false, +[default0]: "profile_step": 1, +[default0]: "module_depth": -1, +[default0]: "top_modules": 1, +[default0]: "detailed": true, +[default0]: "output_file": null +[default0]:} +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] fp16_auto_cast ............... False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] fp16_enabled ................. True +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] fp16_master_weights_and_gradients False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] global_rank .................. 0 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] gradient_accumulation_steps .. 32 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] gradient_clipping ............ 1.0 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] gradient_predivide_factor .... 1.0 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] initial_dynamic_scale ........ 4096 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] load_universal_checkpoint .... False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] loss_scale ................... 0 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] memory_breakdown ............. False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] monitor_config ............... +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] nebula_config ................ { +[default0]: "enabled": false, +[default0]: "persistent_storage_path": null, +[default0]: "persistent_time_interval": 100, +[default0]: "num_of_version_in_retention": 2, +[default0]: "enable_nebula_load": true, +[default0]: "load_path": null +[default0]:} +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] optimizer_legacy_fusion ...... False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] optimizer_name ............... None +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] optimizer_params ............. None +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] pld_enabled .................. False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] pld_params ................... False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] prescale_gradients ........... False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] scheduler_name ............... None +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] scheduler_params ............. None +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] sparse_attention ............. None +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] sparse_gradients_enabled ..... False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] steps_per_print .............. 2000 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] train_batch_size ............. 1024 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] train_micro_batch_size_per_gpu 1 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] wall_clock_breakdown ......... False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] world_size ................... 32 +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] zero_allow_untested_optimizer False +[default0]:[2022-10-06 12:35:00,155] [INFO] [config.py:991:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +[default0]:[2022-10-06 12:35:00,156] [INFO] [config.py:991:print] zero_enabled ................. True +[default0]:[2022-10-06 12:35:00,156] [INFO] [config.py:991:print] zero_optimization_stage ...... 1 +[default0]:[2022-10-06 12:35:00,156] [INFO] [config.py:976:print_user_config] json = { +[default0]: "train_micro_batch_size_per_gpu": 1, +[default0]: "train_batch_size": 1.024000e+03, +[default0]: "gradient_clipping": 1.0, +[default0]: "zero_optimization": { +[default0]: "stage": 1 +[default0]: }, +[default0]: "fp16": { +[default0]: "enabled": true, +[default0]: "loss_scale": 0, +[default0]: "loss_scale_window": 500, +[default0]: "hysteresis": 2, +[default0]: "min_loss_scale": 1, +[default0]: "initial_scale_power": 12 +[default0]: }, +[default0]: "steps_per_print": 2.000000e+03, +[default0]: "wall_clock_breakdown": false +[default0]:} +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.00046706199645996094 seconds +[default0]:[2022-10-06 12:35:00,156] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=32 micro_batch_size=1 +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default4]: assert len(self.ckpt_list) > 0 +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]:AssertionError +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default3]: assert len(self.ckpt_list) > 0 +[default3]:AssertionError +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default2]: assert len(self.ckpt_list) > 0 +[default2]:AssertionError +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default0]: assert len(self.ckpt_list) > 0 +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default2]: assert len(self.ckpt_list) > 0 +[default2]:AssertionError +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default0]: assert len(self.ckpt_list) > 0 +[default1]: assert len(self.ckpt_list) > 0 +[default1]:AssertionError +[default0]:AssertionError +[default4]:Traceback (most recent call last): +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default1]: assert len(self.ckpt_list) > 0 +[default1]:AssertionError +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default4]: assert len(self.ckpt_list) > 0 +[default4]:AssertionError +[default2]:Traceback (most recent call last): +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default2]: assert len(self.ckpt_list) > 0 +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]:AssertionError +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default0]: assert len(self.ckpt_list) > 0 +[default3]: assert len(self.ckpt_list) > 0 +[default3]:AssertionError +[default0]:AssertionError +[default0]:[2022-10-06 12:35:00,285] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=31 [0, 31) STAGE_PARAMS=559214592 (559.215M) TOTAL_PARAMS=559214592 (559.215M) UNIQUE_PARAMS=559214592 (559.215M) +[default7]:Traceback (most recent call last): +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default7]: assert len(self.ckpt_list) > 0 +[default7]:AssertionError +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default6]: assert len(self.ckpt_list) > 0 +[default6]:AssertionError +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default5]: assert len(self.ckpt_list) > 0 +[default5]:AssertionError +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default0]: assert len(self.ckpt_list) > 0 +[default0]:AssertionError +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]:Traceback (most recent call last): +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default5]: assert len(self.ckpt_list) > 0 +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]:AssertionError +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]:Traceback (most recent call last): +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: main() +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: assert len(self.ckpt_list) > 0 +[default4]:AssertionError +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default6]: main() +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default7]: assert len(self.ckpt_list) > 0 +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]:AssertionError +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: assert len(self.ckpt_list) > 0 +[default1]:AssertionError +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default2]: assert len(self.ckpt_list) > 0 +[default2]:AssertionError +[default3]: assert len(self.ckpt_list) > 0 +[default3]:AssertionError +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default6]: assert len(self.ckpt_list) > 0 +[default6]:AssertionError +[default0]:AssertionError +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default1]: assert len(self.ckpt_list) > 0 +[default1]:AssertionError +[default7]:Traceback (most recent call last): +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default6]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default7]: assert len(self.ckpt_list) > 0 +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default6]: assert len(self.ckpt_list) > 0 +[default7]:AssertionError +[default6]:AssertionError +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default5]: assert len(self.ckpt_list) > 0 +[default5]:AssertionError +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default3]: assert len(self.ckpt_list) > 0 +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default3]:AssertionError +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default4]: assert len(self.ckpt_list) > 0 +[default4]:AssertionError +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default7]: assert len(self.ckpt_list) > 0 +[default7]:AssertionError +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default6]: assert len(self.ckpt_list) > 0 +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default5]: assert len(self.ckpt_list) > 0 +[default5]:AssertionError +[default6]:AssertionError +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 692706) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3540781) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3156467) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 282215) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam14-ib0 + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 282216) + error_file: /tmp/torchelastic_hhx6lhcw/none_b220g9yp/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[2]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam14-ib0 + rank : 2 (local_rank: 2) + exitcode : 1 (pid: 282217) + error_file: /tmp/torchelastic_hhx6lhcw/none_b220g9yp/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[3]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam14-ib0 + rank : 3 (local_rank: 3) + exitcode : 1 (pid: 282218) + error_file: /tmp/torchelastic_hhx6lhcw/none_b220g9yp/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + return _run_code(code, main_globals, None, + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[4]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam14-ib0 + rank : 4 (local_rank: 4) + exitcode : 1 (pid: 282219) + error_file: /tmp/torchelastic_hhx6lhcw/none_b220g9yp/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[5]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam14-ib0 + rank : 5 (local_rank: 5) + exitcode : 1 (pid: 282220) + error_file: /tmp/torchelastic_hhx6lhcw/none_b220g9yp/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[6]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam14-ib0 + rank : 6 (local_rank: 6) + exitcode : 1 (pid: 282221) + error_file: /tmp/torchelastic_hhx6lhcw/none_b220g9yp/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[7]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam14-ib0 + rank : 7 (local_rank: 7) + exitcode : 1 (pid: 282222) + error_file: /tmp/torchelastic_hhx6lhcw/none_b220g9yp/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam14-ib0 + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 282215) + error_file: /tmp/torchelastic_hhx6lhcw/none_b220g9yp/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +============================================================ + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam17-ib0 + rank : 25 (local_rank: 1) + exitcode : 1 (pid: 3156468) + error_file: /tmp/torchelastic_4fjv5z6e/none_7hlq6jgq/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[2]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam17-ib0 + rank : 26 (local_rank: 2) + exitcode : 1 (pid: 3156469) + error_file: /tmp/torchelastic_4fjv5z6e/none_7hlq6jgq/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[3]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam17-ib0 + rank : 27 (local_rank: 3) + exitcode : 1 (pid: 3156470) + error_file: /tmp/torchelastic_4fjv5z6e/none_7hlq6jgq/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[4]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam17-ib0 + rank : 28 (local_rank: 4) + exitcode : 1 (pid: 3156471) + error_file: /tmp/torchelastic_4fjv5z6e/none_7hlq6jgq/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[5]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam17-ib0 + rank : 29 (local_rank: 5) + exitcode : 1 (pid: 3156472) + error_file: /tmp/torchelastic_4fjv5z6e/none_7hlq6jgq/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[6]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam17-ib0 + rank : 30 (local_rank: 6) + exitcode : 1 (pid: 3156473) + error_file: /tmp/torchelastic_4fjv5z6e/none_7hlq6jgq/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[7]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam17-ib0 + rank : 31 (local_rank: 7) + exitcode : 1 (pid: 3156474) + error_file: /tmp/torchelastic_4fjv5z6e/none_7hlq6jgq/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam17-ib0 + rank : 24 (local_rank: 0) + exitcode : 1 (pid: 3156467) + error_file: /tmp/torchelastic_4fjv5z6e/none_7hlq6jgq/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +============================================================ + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam15-ib0 + rank : 9 (local_rank: 1) + exitcode : 1 (pid: 692707) + error_file: /tmp/torchelastic_4_t82l8u/none_18xxpxiq/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[2]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam15-ib0 + rank : 10 (local_rank: 2) + exitcode : 1 (pid: 692708) + error_file: /tmp/torchelastic_4_t82l8u/none_18xxpxiq/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[3]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam15-ib0 + rank : 11 (local_rank: 3) + exitcode : 1 (pid: 692709) + error_file: /tmp/torchelastic_4_t82l8u/none_18xxpxiq/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[4]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam15-ib0 + rank : 12 (local_rank: 4) + exitcode : 1 (pid: 692710) + error_file: /tmp/torchelastic_4_t82l8u/none_18xxpxiq/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[5]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam15-ib0 + rank : 13 (local_rank: 5) + exitcode : 1 (pid: 692711) + error_file: /tmp/torchelastic_4_t82l8u/none_18xxpxiq/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam16-ib0 + rank : 17 (local_rank: 1) + exitcode : 1 (pid: 3540782) + error_file: /tmp/torchelastic_tkr0d1b1/none_6x3dosd3/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[2]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam16-ib0 + rank : 18 (local_rank: 2) + exitcode : 1 (pid: 3540783) + error_file: /tmp/torchelastic_tkr0d1b1/none_6x3dosd3/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[3]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam16-ib0 + rank : 19 (local_rank: 3) + exitcode : 1 (pid: 3540784) + error_file: /tmp/torchelastic_tkr0d1b1/none_6x3dosd3/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[4]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam16-ib0 + rank : 20 (local_rank: 4) + exitcode : 1 (pid: 3540785) + error_file: /tmp/torchelastic_tkr0d1b1/none_6x3dosd3/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[5]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam16-ib0 + rank : 21 (local_rank: 5) + exitcode : 1 (pid: 3540788) + error_file: /tmp/torchelastic_tkr0d1b1/none_6x3dosd3/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[6]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam16-ib0 + rank : 22 (local_rank: 6) + exitcode : 1 (pid: 3540789) + error_file: /tmp/torchelastic_tkr0d1b1/none_6x3dosd3/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[7]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam16-ib0 + rank : 23 (local_rank: 7) + exitcode : 1 (pid: 3540790) + error_file: /tmp/torchelastic_tkr0d1b1/none_6x3dosd3/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam16-ib0 + rank : 16 (local_rank: 0) + exitcode : 1 (pid: 3540781) + error_file: /tmp/torchelastic_tkr0d1b1/none_6x3dosd3/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +============================================================ + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[6]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam15-ib0 + rank : 14 (local_rank: 6) + exitcode : 1 (pid: 692712) + error_file: /tmp/torchelastic_4_t82l8u/none_18xxpxiq/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[7]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam15-ib0 + rank : 15 (local_rank: 7) + exitcode : 1 (pid: 692713) + error_file: /tmp/torchelastic_4_t82l8u/none_18xxpxiq/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:35:00 + host : jean-zay-iam15-ib0 + rank : 8 (local_rank: 0) + exitcode : 1 (pid: 692706) + error_file: /tmp/torchelastic_4_t82l8u/none_18xxpxiq/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +============================================================ +srun: error: jean-zay-iam14: task 0: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2075229.0 +srun: error: jean-zay-iam16: task 2: Exited with exit code 1 +srun: error: jean-zay-iam15: task 1: Exited with exit code 1 +srun: error: jean-zay-iam17: task 3: Exited with exit code 1 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:Offline mode: forcing local_files_only=True +[default0]:using world size: 32, data-parallel-size: 32, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 32 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2075256.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 10 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 4096 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 1024 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 1024 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/kill-switch-tr13e-350M-mtf +[default0]: kv_channels ..................................... 64 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 1 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 1 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/tr13e-350M-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation_pretraining', 'validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document'], ['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0'], ['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.0330676168743166', '0.011242051312222764', '0.13027200903379185', '0.22171164529099704', '0.10667815627928671', '0.0015595123898173287', '0.13054018439603915', '0.01091803753667153', '0.00011021422347108609', '0.005492381453597748', '0.0004021215011318779', '0.007470068593492175', '0.0006190467776576425', '0.0010335296343329384', '0.0005012010684646179', '0.0006672772956128299', '0.00035928138344705506', '0.0005084433130291778', '0.0021137328219915496', '0.0009129946225980253', '0.0012454301613725426', '0.00031588689199263235', '0.08137213783015229', '0.055293935695898196', '0.04954150576361177', '0.02461641286531197', '0.12091748245519074', '0.0005177025345001541'], ['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 32 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 32 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-06 12:37:35,993] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[default7]:> setting tensorboard ... +[default0]:> initializing tensor model parallel with size 1 +[default0]:> initializing pipeline model parallel with size 1 +[default0]:> setting random seeds to 42 ... +[default0]:[2022-10-06 12:37:38,203] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2760 and data parallel seed: 42 +[default0]:> compiling dataset index builder ... +[default0]:make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/data' +[default0]:make: Nothing to be done for 'default'. +[default0]:make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/data' +[default0]:>>> done with dataset index builder. Compilation time: 0.057 seconds +[default0]:> compiling and loading fused kernels ... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module fused_mix_prec_layer_norm_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module fused_mix_prec_layer_norm_cuda... +[default0]:>>> done with compiling and loading fused kernels. Compilation time: 5.841 seconds +[default0]:time to initialize megatron (seconds): -7.896 +[default0]:[after megatron is initialized] datetime: 2022-10-06 12:37:44 +[default0]:building GPT model ... +[default0]:[2022-10-06 12:37:44,149] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2022-10-06 12:37:44,150] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +[default0]:[2022-10-06 12:37:44,150] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.27 GB, percent = 6.6% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pipe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31} +[default0]:[2022-10-06 12:37:45,114] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer|embedding +[default0]:stage=0 layers=31 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: ParallelTransformerLayerPipe +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: undo +[default0]: 28: MixedFusedLayerNorm +[default0]: 29: EmbeddingPipe +[default0]: 30: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2022-10-06 12:37:45,220] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2022-10-06 12:37:45,221] [INFO] [utils.py:828:see_memory_usage] MA 1.04 GB Max_MA 1.04 GB CA 1.09 GB Max_CA 1 GB +[default0]:[2022-10-06 12:37:45,221] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.37 GB, percent = 6.6% +[default0]:setting training iterations to 6200 +[default0]:> learning rate decay style: constant +[default0]:DeepSpeed is enabled. +[default0]:[2022-10-06 12:37:45,223] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.1+8b2a6371, git-hash=8b2a6371, git-branch=master +[default0]:[2022-10-06 12:37:47,383] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[default0]:[2022-10-06 12:37:47,384] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +[default0]:[2022-10-06 12:37:47,384] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:[2022-10-06 12:37:47,392] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +[default0]:[2022-10-06 12:37:47,392] [INFO] [utils.py:52:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[default0]:[2022-10-06 12:37:47,392] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer +[default0]:[2022-10-06 12:37:47,392] [INFO] [stage_1_and_2.py:134:__init__] Reduce bucket size 500000000 +[default0]:[2022-10-06 12:37:47,392] [INFO] [stage_1_and_2.py:135:__init__] Allgather bucket size 500000000 +[default0]:[2022-10-06 12:37:47,392] [INFO] [stage_1_and_2.py:136:__init__] CPU Offload: False +[default0]:[2022-10-06 12:37:47,392] [INFO] [stage_1_and_2.py:137:__init__] Round robin gradient partitioning: False +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Emitting ninja build file /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113/utils/build.ninja... +[default4]:Building extension module utils... +[default4]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default4]:ninja: no work to do. +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.2084357738494873 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.20857810974121094 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.20836973190307617 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.20828819274902344 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.20837140083312988 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.20823287963867188 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.20824527740478516 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.20851922035217285 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.23664307594299316 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.23658442497253418 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.23609089851379395 seconds +[default3]:Loading extension module utils... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.23621630668640137 seconds +[default3]:Time to load utils op: 0.2365550994873047 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.23757529258728027 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.23544621467590332 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.23750019073486328 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.23729896545410156 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.2359778881072998 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.23680734634399414 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.23767328262329102 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.23737049102783203 seconds +[default7]:Loading extension module utils... +[default6]:Loading extension module utils... +[default7]:Time to load utils op: 0.23709416389465332 seconds +[default6]:Time to load utils op: 0.23734450340270996 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.23731017112731934 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.311312198638916 seconds +[default3]:Loading extension module utils... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.30907344818115234 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.2330160140991211 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.3085329532623291 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.3073413372039795 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.31191539764404297 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.31229472160339355 seconds +[default3]:Time to load utils op: 0.31102800369262695 seconds +[default0]:Rank: 24 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 25 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Rank: 31 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 30 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 19 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 18 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 4 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 3 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 6 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Rank: 8 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 2 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Rank: 5 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Rank: 7 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 10 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 9 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 14 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Rank: 13 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Rank: 15 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 11 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 12 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 28 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Rank: 29 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Rank: 21 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 20 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 26 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 27 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 17 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Rank: 16 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Rank: 0 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 1 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0017809867858886719 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.001483917236328125 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0017583370208740234 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.001890420913696289 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0015456676483154297 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0016336441040039062 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0016236305236816406 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0016565322875976562 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.001934051513671875 seconds +[default0]:[2022-10-06 12:37:49,864] [INFO] [utils.py:827:see_memory_usage] Before initializing optimizer states +[default0]:[2022-10-06 12:37:49,864] [INFO] [utils.py:828:see_memory_usage] MA 1.11 GB Max_MA 1.12 GB CA 1.84 GB Max_CA 2 GB +[default0]:[2022-10-06 12:37:49,865] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.5 GB, percent = 7.3% +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0013928413391113281 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.001463174819946289 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0013244152069091797 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.001190185546875 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0015888214111328125 seconds +[default6]:Rank: 22 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Rank: 23 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0014522075653076172 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0013012886047363281 seconds +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0016820430755615234 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0016210079193115234 seconds +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0014770030975341797 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0021038055419921875 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0015871524810791016 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0016186237335205078 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0015978813171386719 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0022046566009521484 seconds +[default7]:Time to load utils op: 0.0017528533935546875 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0018787384033203125 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0015454292297363281 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0015192031860351562 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.001554250717163086 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0015082359313964844 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0013971328735351562 seconds +[default0]:[2022-10-06 12:37:49,905] [INFO] [utils.py:827:see_memory_usage] After initializing optimizer states +[default0]:[2022-10-06 12:37:49,905] [INFO] [utils.py:828:see_memory_usage] MA 1.24 GB Max_MA 1.3 GB CA 1.89 GB Max_CA 2 GB +[default0]:[2022-10-06 12:37:49,906] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.5 GB, percent = 7.3% +[default0]:[2022-10-06 12:37:49,906] [INFO] [stage_1_and_2.py:516:__init__] optimizer state initialized +[default0]:[2022-10-06 12:37:49,932] [INFO] [utils.py:827:see_memory_usage] After initializing ZeRO optimizer +[default0]:[2022-10-06 12:37:49,933] [INFO] [utils.py:828:see_memory_usage] MA 1.24 GB Max_MA 1.24 GB CA 1.89 GB Max_CA 2 GB +[default0]:[2022-10-06 12:37:49,933] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.5 GB, percent = 7.3% +[default0]:[2022-10-06 12:37:49,933] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[default0]:[2022-10-06 12:37:49,933] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[default0]:[2022-10-06 12:37:49,933] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[default0]:[2022-10-06 12:37:49,933] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05, 2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:987:print] DeepSpeedEngine configuration: +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] activation_checkpointing_config { +[default0]: "partition_activations": false, +[default0]: "contiguous_memory_optimization": false, +[default0]: "cpu_checkpointing": false, +[default0]: "number_checkpoints": null, +[default0]: "synchronize_checkpoint_boundary": false, +[default0]: "profile": false +[default0]:} +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] amp_enabled .................. False +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] amp_params ................... False +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] autotuning_config ............ { +[default0]: "enabled": false, +[default0]: "start_step": null, +[default0]: "end_step": null, +[default0]: "metric_path": null, +[default0]: "arg_mappings": null, +[default0]: "metric": "throughput", +[default0]: "model_info": null, +[default0]: "results_dir": null, +[default0]: "exps_dir": null, +[default0]: "overwrite": true, +[default0]: "fast": true, +[default0]: "start_profile_step": 3, +[default0]: "end_profile_step": 5, +[default0]: "tuner_type": "gridsearch", +[default0]: "tuner_early_stopping": 5, +[default0]: "tuner_num_trials": 50, +[default0]: "model_info_path": null, +[default0]: "mp_size": 1, +[default0]: "max_train_batch_size": null, +[default0]: "min_train_batch_size": 1, +[default0]: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +[default0]: "min_train_micro_batch_size_per_gpu": 1, +[default0]: "num_tuning_micro_batch_sizes": 3 +[default0]:} +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] bfloat16_enabled ............. False +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] checkpoint_tag_validation_enabled True +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] checkpoint_tag_validation_fail False +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] comms_config ................. +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] communication_data_type ...... None +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] curriculum_enabled ........... False +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] curriculum_params ............ False +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] dataloader_drop_last ......... False +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] disable_allgather ............ False +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] dump_state ................... False +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} +[default0]:[2022-10-06 12:37:49,934] [INFO] [config.py:991:print] eigenvalue_enabled ........... False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] eigenvalue_gas_boundary_resolution 1 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] eigenvalue_layer_name ........ bert.encoder.layer +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] eigenvalue_layer_num ......... 0 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] eigenvalue_max_iter .......... 100 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] eigenvalue_stability ......... 1e-06 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] eigenvalue_tol ............... 0.01 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] eigenvalue_verbose ........... False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] elasticity_enabled ........... False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] flops_profiler_config ........ { +[default0]: "enabled": false, +[default0]: "profile_step": 1, +[default0]: "module_depth": -1, +[default0]: "top_modules": 1, +[default0]: "detailed": true, +[default0]: "output_file": null +[default0]:} +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] fp16_auto_cast ............... False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] fp16_enabled ................. True +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] fp16_master_weights_and_gradients False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] global_rank .................. 0 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] gradient_accumulation_steps .. 32 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] gradient_clipping ............ 1.0 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] gradient_predivide_factor .... 1.0 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] initial_dynamic_scale ........ 4096 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] load_universal_checkpoint .... False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] loss_scale ................... 0 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] memory_breakdown ............. False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] monitor_config ............... +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] nebula_config ................ { +[default0]: "enabled": false, +[default0]: "persistent_storage_path": null, +[default0]: "persistent_time_interval": 100, +[default0]: "num_of_version_in_retention": 2, +[default0]: "enable_nebula_load": true, +[default0]: "load_path": null +[default0]:} +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] optimizer_legacy_fusion ...... False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] optimizer_name ............... None +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] optimizer_params ............. None +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] pld_enabled .................. False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] pld_params ................... False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] prescale_gradients ........... False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] scheduler_name ............... None +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] scheduler_params ............. None +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] sparse_attention ............. None +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] sparse_gradients_enabled ..... False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] steps_per_print .............. 2000 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] train_batch_size ............. 1024 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] train_micro_batch_size_per_gpu 1 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] wall_clock_breakdown ......... False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] world_size ................... 32 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] zero_allow_untested_optimizer False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] zero_enabled ................. True +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:991:print] zero_optimization_stage ...... 1 +[default0]:[2022-10-06 12:37:49,935] [INFO] [config.py:976:print_user_config] json = { +[default0]: "train_micro_batch_size_per_gpu": 1, +[default0]: "train_batch_size": 1.024000e+03, +[default0]: "gradient_clipping": 1.0, +[default0]: "zero_optimization": { +[default0]: "stage": 1 +[default0]: }, +[default0]: "fp16": { +[default0]: "enabled": true, +[default0]: "loss_scale": 0, +[default0]: "loss_scale_window": 500, +[default0]: "hysteresis": 2, +[default0]: "min_loss_scale": 1, +[default0]: "initial_scale_power": 12 +[default0]: }, +[default0]: "steps_per_print": 2.000000e+03, +[default0]: "wall_clock_breakdown": false +[default0]:} +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0004496574401855469 seconds +[default0]:[2022-10-06 12:37:49,936] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=32 micro_batch_size=1 +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default0]: assert len(self.ckpt_list) > 0 +[default0]:AssertionError +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: assert len(self.ckpt_list) > 0 +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]:AssertionError +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default1]: assert len(self.ckpt_list) > 0 +[default1]:AssertionError +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default4]: assert len(self.ckpt_list) > 0 +[default4]:AssertionError +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]:[2022-10-06 12:37:50,064] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=31 [0, 31) STAGE_PARAMS=559214592 (559.215M) TOTAL_PARAMS=559214592 (559.215M) UNIQUE_PARAMS=559214592 (559.215M) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default2]: assert len(self.ckpt_list) > 0 +[default2]:AssertionError +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default2]: assert len(self.ckpt_list) > 0 +[default2]:AssertionError +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: assert len(self.ckpt_list) > 0 +[default0]:AssertionError +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default1]: assert len(self.ckpt_list) > 0 +[default1]:AssertionError +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default4]: assert len(self.ckpt_list) > 0 +[default4]:AssertionError +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default3]: assert len(self.ckpt_list) > 0 +[default3]:AssertionError +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]:Traceback (most recent call last): +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default3]: assert len(self.ckpt_list) > 0 +[default3]:AssertionError +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default2]: main() +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: pretrain( +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default1]: assert len(self.ckpt_list) > 0 +[default1]:AssertionError +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default0]: assert len(self.ckpt_list) > 0 +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]:AssertionError +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default2]: self.check_ckpt_list() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default4]: assert len(self.ckpt_list) > 0 +[default4]:AssertionError +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default2]: assert len(self.ckpt_list) > 0 +[default2]:AssertionError +[default0]:Traceback (most recent call last): +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default0]: main() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default0]: return f(*args, **kwargs) +[default0]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default0]: pretrain( +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default0]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default0]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default0]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default0]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default0]: load_path, client_states = self._load_checkpoint(load_dir, +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default0]: sd_loader = SDLoaderFactory.get_sd_loader( +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default0]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default0]: super().__init__(ckpt_list, version, checkpoint_engine) +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default0]: self.check_ckpt_list() +[default0]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default0]: assert len(self.ckpt_list) > 0 +[default0]:AssertionError +[default2]:Traceback (most recent call last): +[default1]:Traceback (most recent call last): +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default1]: main() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default1]: return f(*args, **kwargs) +[default1]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default1]: pretrain( +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]: main() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: return f(*args, **kwargs) +[default4]:Traceback (most recent call last): +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default4]: main() +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default4]: return f(*args, **kwargs) +[default2]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default4]: pretrain( +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: pretrain( +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default2]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default2]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default2]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default2]: load_path, client_states = self._load_checkpoint(load_dir, +[default4]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default2]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default2]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default4]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default4]: load_path, client_states = self._load_checkpoint(load_dir, +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default4]: sd_loader = SDLoaderFactory.get_sd_loader( +[default2]: super().__init__(ckpt_list, version, checkpoint_engine) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]:Traceback (most recent call last): +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default3]: main() +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default1]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default1]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default1]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default2]: self.check_ckpt_list() +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default2]: assert len(self.ckpt_list) > 0 +[default3]: return f(*args, **kwargs) +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default2]:AssertionError +[default1]: load_path, client_states = self._load_checkpoint(load_dir, +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default1]: sd_loader = SDLoaderFactory.get_sd_loader( +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default1]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: pretrain( +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default3]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default4]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default4]: super().__init__(ckpt_list, version, checkpoint_engine) +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default4]: self.check_ckpt_list() +[default1]: super().__init__(ckpt_list, version, checkpoint_engine) +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default1]: self.check_ckpt_list() +[default1]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default1]: assert len(self.ckpt_list) > 0 +[default1]:AssertionError +[default4]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default4]: assert len(self.ckpt_list) > 0 +[default4]:AssertionError +[default3]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]:Traceback (most recent call last): +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default3]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default3]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default3]: load_path, client_states = self._load_checkpoint(load_dir, +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default3]: sd_loader = SDLoaderFactory.get_sd_loader( +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default3]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default3]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default6]: return f(*args, **kwargs) +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default3]: self.check_ckpt_list() +[default3]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default3]: assert len(self.ckpt_list) > 0 +[default3]:AssertionError +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default7]: assert len(self.ckpt_list) > 0 +[default7]:AssertionError +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default6]: assert len(self.ckpt_list) > 0 +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]:AssertionError +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default5]: assert len(self.ckpt_list) > 0 +[default5]:AssertionError +[default6]:Traceback (most recent call last): +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default7]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: assert len(self.ckpt_list) > 0 +[default7]:AssertionError +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default5]: assert len(self.ckpt_list) > 0 +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]:AssertionError +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default6]: assert len(self.ckpt_list) > 0 +[default6]:AssertionError +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default6]: assert len(self.ckpt_list) > 0 +[default6]:AssertionError +[default5]:Traceback (most recent call last): +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]:Traceback (most recent call last): +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default7]: main() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default7]: return f(*args, **kwargs) +[default7]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default7]: pretrain( +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default7]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default7]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default7]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default7]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default7]: load_path, client_states = self._load_checkpoint(load_dir, +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default7]: assert len(self.ckpt_list) > 0 +[default7]:AssertionError +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: sd_loader = SDLoaderFactory.get_sd_loader( +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default7]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default7]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default7]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default5]: assert len(self.ckpt_list) > 0 +[default5]:AssertionError +[default7]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default7]: assert len(self.ckpt_list) > 0 +[default7]:AssertionError +[default6]:Traceback (most recent call last): +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default6]: main() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default6]: return f(*args, **kwargs) +[default6]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default6]: pretrain( +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default6]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default6]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default6]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default6]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default6]: load_path, client_states = self._load_checkpoint(load_dir, +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default6]: sd_loader = SDLoaderFactory.get_sd_loader( +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default6]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default6]: super().__init__(ckpt_list, version, checkpoint_engine) +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default6]: self.check_ckpt_list() +[default6]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default6]: assert len(self.ckpt_list) > 0 +[default6]:AssertionError +[default5]:Traceback (most recent call last): +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 207, in +[default5]: main() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper +[default5]: return f(*args, **kwargs) +[default5]: File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main +[default5]: pretrain( +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain +[default5]: model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +[default5]: args.iteration = load_checkpoint(model, optimizer, lr_scheduler) +[default5]: File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint +[default5]: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint +[default5]: load_path, client_states = self._load_checkpoint(load_dir, +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint +[default5]: sd_loader = SDLoaderFactory.get_sd_loader( +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader +[default5]: return MegatronSDLoader(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ +[default5]: super().__init__(ckpt_list, version, checkpoint_engine) +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ +[default5]: self.check_ckpt_list() +[default5]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list +[default5]: assert len(self.ckpt_list) > 0 +[default5]:AssertionError +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 283032) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3541457) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3157127) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 693387) of binary: /gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/bin/python +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam15-ib0 + rank : 9 (local_rank: 1) + exitcode : 1 (pid: 693388) + error_file: /tmp/torchelastic_iccl4_lk/none_2hsu8qcp/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[2]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam15-ib0 + rank : 10 (local_rank: 2) + exitcode : 1 (pid: 693389) + error_file: /tmp/torchelastic_iccl4_lk/none_2hsu8qcp/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[3]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam15-ib0 + rank : 11 (local_rank: 3) + exitcode : 1 (pid: 693390) + error_file: /tmp/torchelastic_iccl4_lk/none_2hsu8qcp/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[4]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam15-ib0 + rank : 12 (local_rank: 4) + exitcode : 1 (pid: 693391) + error_file: /tmp/torchelastic_iccl4_lk/none_2hsu8qcp/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[5]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam15-ib0 + rank : 13 (local_rank: 5) + exitcode : 1 (pid: 693392) + error_file: /tmp/torchelastic_iccl4_lk/none_2hsu8qcp/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[6]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam15-ib0 + rank : 14 (local_rank: 6) + exitcode : 1 (pid: 693393) + error_file: /tmp/torchelastic_iccl4_lk/none_2hsu8qcp/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[7]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam15-ib0 + rank : 15 (local_rank: 7) + exitcode : 1 (pid: 693394) + error_file: /tmp/torchelastic_iccl4_lk/none_2hsu8qcp/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam15-ib0 + rank : 8 (local_rank: 0) + exitcode : 1 (pid: 693387) + error_file: /tmp/torchelastic_iccl4_lk/none_2hsu8qcp/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +============================================================ +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + return _run_code(code, main_globals, None, + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in +Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 194, in _run_module_as_main + main() + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return _run_code(code, main_globals, None, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/runpy.py", line 87, in _run_code + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + exec(code, run_globals) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 765, in + raise ChildFailedError( + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam16-ib0 + rank : 17 (local_rank: 1) + exitcode : 1 (pid: 3541458) + error_file: /tmp/torchelastic_9c9p1lgx/none_cqhvc03q/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + raise ChildFailedError( + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[2]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam16-ib0 + rank : 18 (local_rank: 2) + exitcode : 1 (pid: 3541459) + error_file: /tmp/torchelastic_9c9p1lgx/none_cqhvc03q/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam14-ib0 + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 283033) + error_file: /tmp/torchelastic_7ozho6cg/none_6lfuso11/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[3]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam16-ib0 + rank : 19 (local_rank: 3) + exitcode : 1 (pid: 3541460) + error_file: /tmp/torchelastic_9c9p1lgx/none_cqhvc03q/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[4]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam16-ib0 + rank : 20 (local_rank: 4) + exitcode : 1 (pid: 3541461) + error_file: /tmp/torchelastic_9c9p1lgx/none_cqhvc03q/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[5]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam16-ib0 + rank : 21 (local_rank: 5) + exitcode : 1 (pid: 3541462) + error_file: /tmp/torchelastic_9c9p1lgx/none_cqhvc03q/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[6]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam16-ib0 + rank : 22 (local_rank: 6) + exitcode : 1 (pid: 3541463) + error_file: /tmp/torchelastic_9c9p1lgx/none_cqhvc03q/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[7]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam16-ib0 + rank : 23 (local_rank: 7) + exitcode : 1 (pid: 3541464) + error_file: /tmp/torchelastic_9c9p1lgx/none_cqhvc03q/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam16-ib0 + rank : 16 (local_rank: 0) + exitcode : 1 (pid: 3541457) + error_file: /tmp/torchelastic_9c9p1lgx/none_cqhvc03q/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +============================================================ + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[2]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam14-ib0 + rank : 2 (local_rank: 2) + exitcode : 1 (pid: 283034) + error_file: /tmp/torchelastic_7ozho6cg/none_6lfuso11/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[3]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam14-ib0 + rank : 3 (local_rank: 3) + exitcode : 1 (pid: 283035) + error_file: /tmp/torchelastic_7ozho6cg/none_6lfuso11/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[4]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam14-ib0 + rank : 4 (local_rank: 4) + exitcode : 1 (pid: 283036) + error_file: /tmp/torchelastic_7ozho6cg/none_6lfuso11/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[5]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam14-ib0 + rank : 5 (local_rank: 5) + exitcode : 1 (pid: 283037) + error_file: /tmp/torchelastic_7ozho6cg/none_6lfuso11/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[6]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam14-ib0 + rank : 6 (local_rank: 6) + exitcode : 1 (pid: 283038) + error_file: /tmp/torchelastic_7ozho6cg/none_6lfuso11/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[7]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam14-ib0 + rank : 7 (local_rank: 7) + exitcode : 1 (pid: 283039) + error_file: /tmp/torchelastic_7ozho6cg/none_6lfuso11/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam14-ib0 + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 283032) + error_file: /tmp/torchelastic_7ozho6cg/none_6lfuso11/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +============================================================ + main() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 761, in main + run(args) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/run.py", line 752, in run + elastic_launch( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam17-ib0 + rank : 25 (local_rank: 1) + exitcode : 1 (pid: 3157128) + error_file: /tmp/torchelastic_gjgpa4i1/none_7ily58ht/attempt_0/1/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[2]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam17-ib0 + rank : 26 (local_rank: 2) + exitcode : 1 (pid: 3157129) + error_file: /tmp/torchelastic_gjgpa4i1/none_7ily58ht/attempt_0/2/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[3]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam17-ib0 + rank : 27 (local_rank: 3) + exitcode : 1 (pid: 3157130) + error_file: /tmp/torchelastic_gjgpa4i1/none_7ily58ht/attempt_0/3/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[4]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam17-ib0 + rank : 28 (local_rank: 4) + exitcode : 1 (pid: 3157131) + error_file: /tmp/torchelastic_gjgpa4i1/none_7ily58ht/attempt_0/4/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[5]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam17-ib0 + rank : 29 (local_rank: 5) + exitcode : 1 (pid: 3157132) + error_file: /tmp/torchelastic_gjgpa4i1/none_7ily58ht/attempt_0/5/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[6]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam17-ib0 + rank : 30 (local_rank: 6) + exitcode : 1 (pid: 3157133) + error_file: /tmp/torchelastic_gjgpa4i1/none_7ily58ht/attempt_0/6/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +[7]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam17-ib0 + rank : 31 (local_rank: 7) + exitcode : 1 (pid: 3157134) + error_file: /tmp/torchelastic_gjgpa4i1/none_7ily58ht/attempt_0/7/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2022-10-06_12:37:50 + host : jean-zay-iam17-ib0 + rank : 24 (local_rank: 0) + exitcode : 1 (pid: 3157127) + error_file: /tmp/torchelastic_gjgpa4i1/none_7ily58ht/attempt_0/0/error.json + traceback : Traceback (most recent call last): + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper + return f(*args, **kwargs) + File "/gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/finetune_t0.py", line 199, in main + pretrain( + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 141, in pretrain + model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/training.py", line 450, in setup_model_and_optimizer + args.iteration = load_checkpoint(model, optimizer, lr_scheduler) + File "/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/checkpointing.py", line 278, in load_checkpoint + loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_module_only=not load_optimizer_states, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2574, in load_checkpoint + load_path, client_states = self._load_checkpoint(load_dir, + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2608, in _load_checkpoint + sd_loader = SDLoaderFactory.get_sd_loader( + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 44, in get_sd_loader + return MegatronSDLoader(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 216, in __init__ + super().__init__(ckpt_list, version, checkpoint_engine) + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 56, in __init__ + self.check_ckpt_list() + File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed/runtime/state_dict_factory.py", line 179, in check_ckpt_list + assert len(self.ckpt_list) > 0 + AssertionError + +============================================================ +srun: error: jean-zay-iam15: task 1: Exited with exit code 1 +srun: launch/slurm: _step_signal: Terminating StepId=2075256.0 +srun: error: jean-zay-iam16: task 2: Exited with exit code 1 +srun: error: jean-zay-iam14: task 0: Exited with exit code 1 +srun: error: jean-zay-iam17: task 3: Exited with exit code 1 +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:using world size: 32, data-parallel-size: 32, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 32 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2075274.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 10 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 4096 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 1024 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 1024 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/kill-switch-tr13e-350M-mtf +[default0]: kv_channels ..................................... 64 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... True +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 1 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. True +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 2 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 1 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/tr13e-350M-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation_pretraining', 'validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document'], ['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0'], ['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.0330676168743166', '0.011242051312222764', '0.13027200903379185', '0.22171164529099704', '0.10667815627928671', '0.0015595123898173287', '0.13054018439603915', '0.01091803753667153', '0.00011021422347108609', '0.005492381453597748', '0.0004021215011318779', '0.007470068593492175', '0.0006190467776576425', '0.0010335296343329384', '0.0005012010684646179', '0.0006672772956128299', '0.00035928138344705506', '0.0005084433130291778', '0.0021137328219915496', '0.0009129946225980253', '0.0012454301613725426', '0.00031588689199263235', '0.08137213783015229', '0.055293935695898196', '0.04954150576361177', '0.02461641286531197', '0.12091748245519074', '0.0005177025345001541'], ['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 32 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 32 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-06 12:39:32,326] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[default7]:> setting tensorboard ... +[default0]:> initializing tensor model parallel with size 1 +[default0]:> initializing pipeline model parallel with size 1 +[default0]:> setting random seeds to 42 ... +[default0]:[2022-10-06 12:39:34,526] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2760 and data parallel seed: 42 +[default0]:> compiling dataset index builder ... +[default0]:make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/data' +[default0]:make: Nothing to be done for 'default'. +[default0]:make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/data' +[default0]:>>> done with dataset index builder. Compilation time: 0.053 seconds +[default0]:> compiling and loading fused kernels ... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module fused_mix_prec_layer_norm_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module fused_mix_prec_layer_norm_cuda... +[default0]:>>> done with compiling and loading fused kernels. Compilation time: 5.946 seconds +[default0]:time to initialize megatron (seconds): -19.473 +[default0]:[after megatron is initialized] datetime: 2022-10-06 12:39:40 +[default0]:building GPT model ... +[default0]:[2022-10-06 12:39:40,576] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2022-10-06 12:39:40,577] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +[default0]:[2022-10-06 12:39:40,577] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.26 GB, percent = 6.6% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pipe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31} +[default0]:[2022-10-06 12:39:41,543] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer|embedding +[default0]:stage=0 layers=31 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: ParallelTransformerLayerPipe +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: undo +[default0]: 28: MixedFusedLayerNorm +[default0]: 29: EmbeddingPipe +[default0]: 30: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2022-10-06 12:39:41,647] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2022-10-06 12:39:41,647] [INFO] [utils.py:828:see_memory_usage] MA 1.04 GB Max_MA 1.04 GB CA 1.09 GB Max_CA 1 GB +[default0]:[2022-10-06 12:39:41,647] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.37 GB, percent = 6.6% +[default0]:setting training iterations to 6200 +[default0]:> learning rate decay style: constant +[default0]:DeepSpeed is enabled. +[default0]:[2022-10-06 12:39:41,649] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.1+8b2a6371, git-hash=8b2a6371, git-branch=master +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:[2022-10-06 12:39:43,875] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[default0]:[2022-10-06 12:39:43,876] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +[default0]:[2022-10-06 12:39:43,876] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[default0]:[2022-10-06 12:39:43,883] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +[default0]:[2022-10-06 12:39:43,883] [INFO] [utils.py:52:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[default0]:[2022-10-06 12:39:43,883] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer +[default0]:[2022-10-06 12:39:43,883] [INFO] [stage_1_and_2.py:134:__init__] Reduce bucket size 500000000 +[default0]:[2022-10-06 12:39:43,883] [INFO] [stage_1_and_2.py:135:__init__] Allgather bucket size 500000000 +[default0]:[2022-10-06 12:39:43,883] [INFO] [stage_1_and_2.py:136:__init__] CPU Offload: False +[default0]:[2022-10-06 12:39:43,883] [INFO] [stage_1_and_2.py:137:__init__] Round robin gradient partitioning: False +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Emitting ninja build file /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113/utils/build.ninja... +[default6]:Building extension module utils... +[default6]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default6]:ninja: no work to do. +[default6]:Loading extension module utils... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.23503875732421875 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.31128740310668945 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.23517680168151855 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.23528623580932617 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.23569703102111816 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.23508214950561523 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.23584365844726562 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.31245923042297363 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.2351682186126709 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.2302863597869873 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.23518705368041992 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.23015379905700684 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.23030304908752441 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.3096144199371338 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.30896997451782227 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.30999755859375 seconds +[default5]:Loading extension module utils... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.30806994438171387 seconds +[default5]:Time to load utils op: 0.30911779403686523 seconds +[default3]:Loading extension module utils... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.3088819980621338 seconds +[default3]:Time to load utils op: 0.30885910987854004 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.22997784614562988 seconds +[default7]:Loading extension module utils... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.23024249076843262 seconds +[default7]:Time to load utils op: 0.22997713088989258 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.2301778793334961 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.2303464412689209 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.3072690963745117 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.3089137077331543 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.3086559772491455 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.3106975555419922 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.31142663955688477 seconds +[default6]:Time to load utils op: 0.23461270332336426 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.3096287250518799 seconds +[default5]:Rank: 29 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 28 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 6 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Rank: 15 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 14 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Rank: 7 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 3 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 19 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 2 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 10 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 11 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Rank: 16 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 18 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 17 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 20 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 22 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Rank: 21 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Rank: 23 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Rank: 5 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 4 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 30 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Rank: 31 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Rank: 24 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 25 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 9 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Rank: 8 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Rank: 13 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 12 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 1 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Rank: 0 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0034241676330566406 seconds +[default3]:Rank: 27 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.004215717315673828 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0031180381774902344 seconds +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.003159761428833008 seconds +[default6]:Time to load utils op: 0.0034897327423095703 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0035400390625 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0035576820373535156 seconds +[default2]:Rank: 26 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0032913684844970703 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0030515193939208984 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0037369728088378906 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0030100345611572266 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.003242969512939453 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0034415721893310547 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.003824472427368164 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.003319978713989258 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0035796165466308594 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0034034252166748047 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.003475666046142578 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0033316612243652344 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.004266262054443359 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.002855062484741211 seconds +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0030517578125 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0029702186584472656 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0029757022857666016 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0033731460571289062 seconds +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0034286975860595703 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0032584667205810547 seconds +[default6]:Time to load utils op: 0.003114461898803711 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0036439895629882812 seconds +[default3]:Time to load utils op: 0.0034182071685791016 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.003915309906005859 seconds +[default0]:[2022-10-06 12:39:48,207] [INFO] [utils.py:827:see_memory_usage] Before initializing optimizer states +[default0]:[2022-10-06 12:39:48,208] [INFO] [utils.py:828:see_memory_usage] MA 1.11 GB Max_MA 1.12 GB CA 1.84 GB Max_CA 2 GB +[default0]:[2022-10-06 12:39:48,208] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.32 GB, percent = 7.2% +[default0]:[2022-10-06 12:39:48,247] [INFO] [utils.py:827:see_memory_usage] After initializing optimizer states +[default0]:[2022-10-06 12:39:48,248] [INFO] [utils.py:828:see_memory_usage] MA 1.24 GB Max_MA 1.3 GB CA 1.89 GB Max_CA 2 GB +[default0]:[2022-10-06 12:39:48,248] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.32 GB, percent = 7.2% +[default0]:[2022-10-06 12:39:48,248] [INFO] [stage_1_and_2.py:516:__init__] optimizer state initialized +[default0]:[2022-10-06 12:39:48,276] [INFO] [utils.py:827:see_memory_usage] After initializing ZeRO optimizer +[default0]:[2022-10-06 12:39:48,276] [INFO] [utils.py:828:see_memory_usage] MA 1.24 GB Max_MA 1.24 GB CA 1.89 GB Max_CA 2 GB +[default0]:[2022-10-06 12:39:48,276] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.32 GB, percent = 7.2% +[default0]:[2022-10-06 12:39:48,276] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[default0]:[2022-10-06 12:39:48,276] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[default0]:[2022-10-06 12:39:48,276] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[default0]:[2022-10-06 12:39:48,276] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05, 2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:987:print] DeepSpeedEngine configuration: +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:991:print] activation_checkpointing_config { +[default0]: "partition_activations": false, +[default0]: "contiguous_memory_optimization": false, +[default0]: "cpu_checkpointing": false, +[default0]: "number_checkpoints": null, +[default0]: "synchronize_checkpoint_boundary": false, +[default0]: "profile": false +[default0]:} +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:991:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:991:print] amp_enabled .................. False +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:991:print] amp_params ................... False +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:991:print] autotuning_config ............ { +[default0]: "enabled": false, +[default0]: "start_step": null, +[default0]: "end_step": null, +[default0]: "metric_path": null, +[default0]: "arg_mappings": null, +[default0]: "metric": "throughput", +[default0]: "model_info": null, +[default0]: "results_dir": null, +[default0]: "exps_dir": null, +[default0]: "overwrite": true, +[default0]: "fast": true, +[default0]: "start_profile_step": 3, +[default0]: "end_profile_step": 5, +[default0]: "tuner_type": "gridsearch", +[default0]: "tuner_early_stopping": 5, +[default0]: "tuner_num_trials": 50, +[default0]: "model_info_path": null, +[default0]: "mp_size": 1, +[default0]: "max_train_batch_size": null, +[default0]: "min_train_batch_size": 1, +[default0]: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +[default0]: "min_train_micro_batch_size_per_gpu": 1, +[default0]: "num_tuning_micro_batch_sizes": 3 +[default0]:} +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:991:print] bfloat16_enabled ............. False +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:991:print] checkpoint_tag_validation_enabled True +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:991:print] checkpoint_tag_validation_fail False +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:991:print] comms_config ................. +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:991:print] communication_data_type ...... None +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:991:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:991:print] curriculum_enabled ........... False +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:991:print] curriculum_params ............ False +[default0]:[2022-10-06 12:39:48,277] [INFO] [config.py:991:print] dataloader_drop_last ......... False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] disable_allgather ............ False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] dump_state ................... False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] eigenvalue_enabled ........... False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] eigenvalue_gas_boundary_resolution 1 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] eigenvalue_layer_name ........ bert.encoder.layer +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] eigenvalue_layer_num ......... 0 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] eigenvalue_max_iter .......... 100 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] eigenvalue_stability ......... 1e-06 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] eigenvalue_tol ............... 0.01 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] eigenvalue_verbose ........... False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] elasticity_enabled ........... False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] flops_profiler_config ........ { +[default0]: "enabled": false, +[default0]: "profile_step": 1, +[default0]: "module_depth": -1, +[default0]: "top_modules": 1, +[default0]: "detailed": true, +[default0]: "output_file": null +[default0]:} +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] fp16_auto_cast ............... False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] fp16_enabled ................. True +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] fp16_master_weights_and_gradients False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] global_rank .................. 0 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] gradient_accumulation_steps .. 32 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] gradient_clipping ............ 1.0 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] gradient_predivide_factor .... 1.0 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] initial_dynamic_scale ........ 4096 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] load_universal_checkpoint .... False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] loss_scale ................... 0 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] memory_breakdown ............. False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] monitor_config ............... +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] nebula_config ................ { +[default0]: "enabled": false, +[default0]: "persistent_storage_path": null, +[default0]: "persistent_time_interval": 100, +[default0]: "num_of_version_in_retention": 2, +[default0]: "enable_nebula_load": true, +[default0]: "load_path": null +[default0]:} +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] optimizer_legacy_fusion ...... False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] optimizer_name ............... None +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] optimizer_params ............. None +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] pld_enabled .................. False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] pld_params ................... False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] prescale_gradients ........... False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] scheduler_name ............... None +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] scheduler_params ............. None +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] sparse_attention ............. None +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] sparse_gradients_enabled ..... False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] steps_per_print .............. 2000 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] train_batch_size ............. 1024 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] train_micro_batch_size_per_gpu 1 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] wall_clock_breakdown ......... False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] world_size ................... 32 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] zero_allow_untested_optimizer False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] zero_enabled ................. True +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:991:print] zero_optimization_stage ...... 1 +[default0]:[2022-10-06 12:39:48,278] [INFO] [config.py:976:print_user_config] json = { +[default0]: "train_micro_batch_size_per_gpu": 1, +[default0]: "train_batch_size": 1.024000e+03, +[default0]: "gradient_clipping": 1.0, +[default0]: "zero_optimization": { +[default0]: "stage": 1 +[default0]: }, +[default0]: "fp16": { +[default0]: "enabled": true, +[default0]: "loss_scale": 0, +[default0]: "loss_scale_window": 500, +[default0]: "hysteresis": 2, +[default0]: "min_loss_scale": 1, +[default0]: "initial_scale_power": 12 +[default0]: }, +[default0]: "steps_per_print": 2.000000e+03, +[default0]: "wall_clock_breakdown": false +[default0]:} +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0004591941833496094 seconds +[default0]:[2022-10-06 12:39:48,279] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=32 micro_batch_size=1 +[default3]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:39:48,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:39:48,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:39:48,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:39:48,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:39:48,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,416] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=31 [0, 31) STAGE_PARAMS=559214592 (559.215M) TOTAL_PARAMS=559214592 (559.215M) UNIQUE_PARAMS=559214592 (559.215M) +[default0]:[2022-10-06 12:39:48,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:39:48,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:39:48,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:39:48,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:39:48,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:39:48,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:39:48,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:39:48,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:39:48,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:39:48,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:39:48,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:39:48,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:39:48,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:39:48,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:39:48,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:39:48,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:39:48,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:39:48,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:39:48,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:39:48,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:39:48,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:39:48,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:39:48,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:39:48,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:39:48,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:39:48,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:39:48,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:39:48,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:39:48,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:39:48,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:39:48,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:48,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:48,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:48,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:48,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:48,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:48,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:48,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:48,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:48,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:48,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:48,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:48,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:48,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:48,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,014] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:48,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:48,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:48,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:48,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:48,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:48,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:48,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:48,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:48,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:48,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:48,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:48,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:48,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:48,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,014] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,014] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,032] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,033] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,117] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,215] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,215] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,325] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,330] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,318] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,330] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,416] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,587] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,822] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,880] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,815] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,924] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-06 12:39:49,925] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-06 12:39:49,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,960] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-06 12:39:49,960] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-06 12:39:49,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:50,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:50,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,965] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-06 12:39:49,966] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-06 12:39:49,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,961] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-06 12:39:49,962] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-06 12:39:49,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,966] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-06 12:39:49,966] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-06 12:39:49,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:50,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:50,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:50,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:50,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:50,005] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-06 12:39:50,005] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-06 12:39:49,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:50,000] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-06 12:39:50,001] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-06 12:39:49,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:50,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:50,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,980] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-06 12:39:49,980] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-06 12:39:49,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,926] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-06 12:39:49,927] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-06 12:39:49,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,014] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:49,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:49,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:49,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:49,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:49,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:49,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:49,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:49,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,004] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-06 12:39:50,004] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-06 12:39:49,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:49,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:49,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:49,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:50,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:49,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:50,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:50,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:50,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:50,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:50,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:50,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:50,024] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-06 12:39:50,024] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-06 12:39:49,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:49,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:49,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:50,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:50,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:50,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:50,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:50,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:49,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:49,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:50,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:50,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:50,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,025] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-06 12:39:50,026] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-06 12:39:50,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,097] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-06 12:39:50,097] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-06 12:39:50,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,088] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-06 12:39:50,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,088] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp[default0]:[2022-10-06 12:39:50,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_[default0]:[2022-10-06 12:39:50,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/s[default0]:[2022-10-06 12:39:50,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +ix/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/x[default0]:[2022-10-06 12:39:50,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +p3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp[default0]:[2022-10-06 12:39:50,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-06 12:39:50,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:50,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:50,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:50,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:50,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:50,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:50,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:50,042] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default4]:[2022-10-06 12:39:50,042] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-06 12:39:50,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:50,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:50,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:50,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:50,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:50,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:50,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:50,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:50,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:50,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:50,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:50,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:50,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:50,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:50,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:39:50,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:39:50,106] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default6]:[2022-10-06 12:39:50,106] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-06 12:39:50,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,105] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-06 12:39:50,106] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-06 12:39:50,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,027] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-06 12:39:50,027] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-06 12:39:50,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,105] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-06 12:39:50,105] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-06 12:39:50,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,033] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,040] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-06 12:39:50,041] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-06 12:39:50,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,119] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,119] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,120] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-06 12:39:50,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,119] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,033] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:39:50,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:39:50,100] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default2]:[2022-10-06 12:39:50,100] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-06 12:39:50,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,117] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,071] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-06 12:39:50,072] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-06 12:39:50,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:50,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:50,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:50,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:50,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:50,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:50,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:50,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:50,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:50,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:39:50,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:50,029] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-06 12:39:50,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:50,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:50,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:50,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:50,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:50,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:39:50,030] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-06 12:39:50,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:50,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:50,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:50,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:50,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:39:50,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:39:50,112] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default3]:[2022-10-06 12:39:50,113] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default7]:time (ms) | load-checkpoint: 1775.47 +[default0]:[2022-10-06 12:39:50,117] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,195] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-06 12:39:50,195] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default0]:could not find arguments in the checkpoint ... +[default0]: checkpoint version 3.0 +[default0]: successfully loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq at iteration 0 +[default1]:[2022-10-06 12:39:50,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:39:50,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:39:50,192] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default1]:[2022-10-06 12:39:50,193] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-06 12:39:50,119] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,122] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,150] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-06 12:39:50,151] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-06 12:39:50,119] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,173] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-06 12:39:50,174] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-06 12:39:50,120] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-06 12:39:50,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:39:50,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:39:50,171] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default0]:[2022-10-06 12:39:50,172] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-06 12:39:50,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:39:50,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 12:39:50,131] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default7]:[2022-10-06 12:39:50,131] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-06 12:39:50,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:39:50,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:39:50,180] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default5]:[2022-10-06 12:39:50,180] [WARNING] [engine.py:2810:_get_all_zero_checkpoint_names] The following zero checkpoints paths are missing: ['/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt', '/gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step0/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt'] +[default0]:estimated model parameters: 0.559214592 +[default0]:estimated model parameters without embeddings: 0.302313472 +[default0]:[after model, optimizer, and learning rate scheduler are built] datetime: 2022-10-06 12:39:50 +[default0]:> building train, validation, and test datasets ... +[default0]: > datasets target sizes (minimum size): +[default0]: train: 6348800 +[default0]: validation: 512000 +[default0]: test: 10240 +[default0]:> building train, validation, and test datasets for T0 ... +[default0]: > building dataset index ... +[default0]:/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/utils.py:365: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +[default0]: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.434334 seconds +[default0]: number of documents: 31495184 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 29920425) total of 29920425 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.486903 seconds +[default0]: number of documents: 31495184 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002378 seconds +[default0]: number of documents: 31495184 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_train_indexmap_2470317ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_train_indexmap_2470317ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.094 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.140138 seconds +[default0]: number of documents: 5151349 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 4893782) total of 4893782 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.122352 seconds +[default0]: number of documents: 5151349 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007634 seconds +[default0]: number of documents: 5151349 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_train_indexmap_501992ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_train_indexmap_501992ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.119 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.086666 seconds +[default0]: number of documents: 3562772 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 3384633) total of 3384633 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.134089 seconds +[default0]: number of documents: 3562772 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006529 seconds +[default0]: number of documents: 3562772 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_train_indexmap_406376ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_train_indexmap_406376ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.106 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.124998 seconds +[default0]: number of documents: 2707724 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2572338) total of 2572338 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.091258 seconds +[default0]: number of documents: 2707724 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003654 seconds +[default0]: number of documents: 2707724 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_train_indexmap_373254ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_train_indexmap_373254ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.127 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.102106 seconds +[default0]: number of documents: 5055942 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 4803145) total of 4803145 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.111763 seconds +[default0]: number of documents: 5055942 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004692 seconds +[default0]: number of documents: 5055942 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_train_indexmap_367732ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_train_indexmap_367732ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.107 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.099258 seconds +[default0]: number of documents: 2148955 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2041507) total of 2041507 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.087903 seconds +[default0]: number of documents: 2148955 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005252 seconds +[default0]: number of documents: 2148955 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_train_indexmap_310091ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_train_indexmap_310091ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.084 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.077302 seconds +[default0]: number of documents: 2627392 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2496022) total of 2496022 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.085037 seconds +[default0]: number of documents: 2627392 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004709 seconds +[default0]: number of documents: 2627392 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_train_indexmap_305386ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_train_indexmap_305386ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.084 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.118495 seconds +[default0]: number of documents: 3560556 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 3382528) total of 3382528 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.159408 seconds +[default0]: number of documents: 3560556 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004444 seconds +[default0]: number of documents: 3560556 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_train_indexmap_304299ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_train_indexmap_304299ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.089 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.070270 seconds +[default0]: number of documents: 1543441 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1466269) total of 1466269 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.077782 seconds +[default0]: number of documents: 1543441 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004042 seconds +[default0]: number of documents: 1543441 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_train_indexmap_291291ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_train_indexmap_291291ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.089 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.080380 seconds +[default0]: number of documents: 1667306 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1583941) total of 1583941 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.105832 seconds +[default0]: number of documents: 1667306 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005380 seconds +[default0]: number of documents: 1667306 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_train_indexmap_205616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_train_indexmap_205616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.078 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.078125 seconds +[default0]: number of documents: 855756 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 812968) total of 812968 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.068275 seconds +[default0]: number of documents: 855756 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002969 seconds +[default0]: number of documents: 855756 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_train_indexmap_127177ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_train_indexmap_127177ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.070 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.071640 seconds +[default0]: number of documents: 573364 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 544696) total of 544696 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.057559 seconds +[default0]: number of documents: 573364 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001532 seconds +[default0]: number of documents: 573364 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_train_indexmap_88369ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_train_indexmap_88369ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.072 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.040143 seconds +[default0]: number of documents: 410633 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 390101) total of 390101 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051921 seconds +[default0]: number of documents: 410633 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001288 seconds +[default0]: number of documents: 410633 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_train_indexmap_61292ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_train_indexmap_61292ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.054 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.048732 seconds +[default0]: number of documents: 428843 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 407401) total of 407401 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.046574 seconds +[default0]: number of documents: 428843 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001490 seconds +[default0]: number of documents: 428843 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_train_indexmap_55208ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_train_indexmap_55208ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.072 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.050281 seconds +[default0]: number of documents: 417269 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 396406) total of 396406 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.044992 seconds +[default0]: number of documents: 417269 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001550 seconds +[default0]: number of documents: 417269 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_train_indexmap_44171ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_train_indexmap_44171ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.062 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.064853 seconds +[default0]: number of documents: 1114455 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1058732) total of 1058732 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.056792 seconds +[default0]: number of documents: 1114455 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012902 seconds +[default0]: number of documents: 1114455 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_train_indexmap_37186ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_train_indexmap_37186ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.061 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.044628 seconds +[default0]: number of documents: 347499 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 330124) total of 330124 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045360 seconds +[default0]: number of documents: 347499 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001245 seconds +[default0]: number of documents: 347499 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_train_indexmap_37173ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_train_indexmap_37173ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.065 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.036654 seconds +[default0]: number of documents: 339210 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 322250) total of 322250 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.052948 seconds +[default0]: number of documents: 339210 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001138 seconds +[default0]: number of documents: 339210 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_train_indexmap_34691ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_train_indexmap_34691ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.055 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045230 seconds +[default0]: number of documents: 315754 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 299966) total of 299966 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.039656 seconds +[default0]: number of documents: 315754 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000944 seconds +[default0]: number of documents: 315754 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_train_indexmap_26100ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_train_indexmap_26100ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.035 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.052177 seconds +[default0]: number of documents: 918416 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 872495) total of 872495 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.069748 seconds +[default0]: number of documents: 918416 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006507 seconds +[default0]: number of documents: 918416 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_train_indexmap_23389ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_train_indexmap_23389ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.064 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.053708 seconds +[default0]: number of documents: 950097 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 902592) total of 902592 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.052609 seconds +[default0]: number of documents: 950097 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009218 seconds +[default0]: number of documents: 950097 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_train_indexmap_21563ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_train_indexmap_21563ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.058 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.049968 seconds +[default0]: number of documents: 915063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869310) total of 869310 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.061530 seconds +[default0]: number of documents: 915063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004694 seconds +[default0]: number of documents: 915063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_train_indexmap_18042ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_train_indexmap_18042ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.055 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.053429 seconds +[default0]: number of documents: 915061 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869308) total of 869308 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.068257 seconds +[default0]: number of documents: 915061 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005713 seconds +[default0]: number of documents: 915061 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_train_indexmap_17484ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_train_indexmap_17484ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.069 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.058548 seconds +[default0]: number of documents: 915058 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869305) total of 869305 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.052531 seconds +[default0]: number of documents: 915058 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007695 seconds +[default0]: number of documents: 915058 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_train_indexmap_16885ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_train_indexmap_16885ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.056 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.084698 seconds +[default0]: number of documents: 865056 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 821803) total of 821803 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.047243 seconds +[default0]: number of documents: 865056 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006593 seconds +[default0]: number of documents: 865056 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_train_indexmap_16740ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_train_indexmap_16740ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.057 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.070975 seconds +[default0]: number of documents: 915044 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869292) total of 869292 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.066841 seconds +[default0]: number of documents: 915044 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002204 seconds +[default0]: number of documents: 915044 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_train_indexmap_16592ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_train_indexmap_16592ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.090 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051392 seconds +[default0]: number of documents: 915043 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869291) total of 869291 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.058686 seconds +[default0]: number of documents: 915043 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007593 seconds +[default0]: number of documents: 915043 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_train_indexmap_16532ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_train_indexmap_16532ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.069 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051769 seconds +[default0]: number of documents: 915021 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869270) total of 869270 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.082639 seconds +[default0]: number of documents: 915021 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003037 seconds +[default0]: number of documents: 915021 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_train_indexmap_15642ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_train_indexmap_15642ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.052 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051316 seconds +[default0]: number of documents: 915054 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869301) total of 869301 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.059736 seconds +[default0]: number of documents: 915054 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002105 seconds +[default0]: number of documents: 915054 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_train_indexmap_15616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_train_indexmap_15616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.055 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.056296 seconds +[default0]: number of documents: 915051 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869298) total of 869298 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051739 seconds +[default0]: number of documents: 915051 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006719 seconds +[default0]: number of documents: 915051 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_train_indexmap_15230ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_train_indexmap_15230ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.049 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038193 seconds +[default0]: number of documents: 318189 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 302280) total of 302280 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.036268 seconds +[default0]: number of documents: 318189 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001109 seconds +[default0]: number of documents: 318189 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_train_indexmap_12795ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_train_indexmap_12795ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.044 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.042064 seconds +[default0]: number of documents: 265864 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 252571) total of 252571 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.040792 seconds +[default0]: number of documents: 265864 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001268 seconds +[default0]: number of documents: 265864 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_train_indexmap_11605ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_train_indexmap_11605ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.031 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.035176 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038399 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000919 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_train_indexmap_10970ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_train_indexmap_10970ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.036 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.040671 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.034279 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001038 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_train_indexmap_10706ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_train_indexmap_10706ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.031 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.078144 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.070293 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001040 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_train_indexmap_10360ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_train_indexmap_10360ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.043 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.059215 seconds +[default0]: number of documents: 365060 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 346807) total of 346807 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.039441 seconds +[default0]: number of documents: 365060 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001272 seconds +[default0]: number of documents: 365060 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_train_indexmap_8374ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_train_indexmap_8374ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.034 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.042349 seconds +[default0]: number of documents: 365063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 346810) total of 346810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.035520 seconds +[default0]: number of documents: 365063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001196 seconds +[default0]: number of documents: 365063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_train_indexmap_8126ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_train_indexmap_8126ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.038 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.036293 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.035643 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000846 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_train_indexmap_7693ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_train_indexmap_7693ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.032 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.034508 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251921) total of 251921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.043159 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001168 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_train_indexmap_7627ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_train_indexmap_7627ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.046 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.034897 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038606 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001220 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_train_indexmap_7564ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_train_indexmap_7564ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.048 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.040910 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033382 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000953 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_train_indexmap_7497ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_train_indexmap_7497ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.024 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.056385 seconds +[default0]: number of documents: 271191 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 257631) total of 257631 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.035714 seconds +[default0]: number of documents: 271191 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001178 seconds +[default0]: number of documents: 271191 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_train_indexmap_7334ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_train_indexmap_7334ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.047 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.040669 seconds +[default0]: number of documents: 269973 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 256474) total of 256474 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.039941 seconds +[default0]: number of documents: 269973 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001143 seconds +[default0]: number of documents: 269973 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_train_indexmap_7168ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_train_indexmap_7168ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.040 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.047715 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251817) total of 251817 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038007 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000927 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_train_indexmap_7167ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_train_indexmap_7167ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.030 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.043960 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251921) total of 251921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038502 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001078 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_train_indexmap_7098ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_train_indexmap_7098ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.045 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.033943 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251817) total of 251817 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.036155 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000887 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_train_indexmap_7047ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_train_indexmap_7047ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.053 seconds +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.387164, achieved: 0.387164 +[default0]: dataset 1, input: 0.0786754, achieved: 0.0786753 +[default0]: dataset 2, input: 0.0636898, achieved: 0.0636898 +[default0]: dataset 3, input: 0.0584986, achieved: 0.0584986 +[default0]: dataset 4, input: 0.0576332, achieved: 0.0576332 +[default0]: dataset 5, input: 0.0485994, achieved: 0.0485994 +[default0]: dataset 6, input: 0.0478619, achieved: 0.0478619 +[default0]: dataset 7, input: 0.0476917, achieved: 0.0476917 +[default0]: dataset 8, input: 0.045653, achieved: 0.045653 +[default0]: dataset 9, input: 0.0322254, achieved: 0.0322254 +[default0]: dataset 10, input: 0.0199319, achieved: 0.0199319 +[default0]: dataset 11, input: 0.0138497, achieved: 0.0138497 +[default0]: dataset 12, input: 0.00960604, achieved: 0.00960602 +[default0]: dataset 13, input: 0.00865243, achieved: 0.00865244 +[default0]: dataset 14, input: 0.00692261, achieved: 0.00692258 +[default0]: dataset 15, input: 0.00582803, achieved: 0.00582806 +[default0]: dataset 16, input: 0.00582586, achieved: 0.00582586 +[default0]: dataset 17, input: 0.00543684, achieved: 0.00543682 +[default0]: dataset 18, input: 0.00409056, achieved: 0.00409057 +[default0]: dataset 19, input: 0.00366562, achieved: 0.00366564 +[default0]: dataset 20, input: 0.00337934, achieved: 0.00337937 +[default0]: dataset 21, input: 0.00282756, achieved: 0.00282753 +[default0]: dataset 22, input: 0.00274012, achieved: 0.00274012 +[default0]: dataset 23, input: 0.00264619, achieved: 0.00264622 +[default0]: dataset 24, input: 0.00262358, achieved: 0.0026236 +[default0]: dataset 25, input: 0.0026003, achieved: 0.00260032 +[default0]: dataset 26, input: 0.00259099, achieved: 0.00259097 +[default0]: dataset 27, input: 0.00245151, achieved: 0.00245155 +[default0]: dataset 28, input: 0.00244735, achieved: 0.00244736 +[default0]: dataset 29, input: 0.00238684, achieved: 0.00238686 +[default0]: dataset 30, input: 0.0020053, achieved: 0.00200525 +[default0]: dataset 31, input: 0.00181876, achieved: 0.00181879 +[default0]: dataset 32, input: 0.00171918, achieved: 0.00171917 +[default0]: dataset 33, input: 0.00167779, achieved: 0.00167776 +[default0]: dataset 34, input: 0.00162359, achieved: 0.00162355 +[default0]: dataset 35, input: 0.00131237, achieved: 0.00131238 +[default0]: dataset 36, input: 0.00127347, achieved: 0.00127344 +[default0]: dataset 37, input: 0.00120564, achieved: 0.00120569 +[default0]: dataset 38, input: 0.00119533, achieved: 0.00119529 +[default0]: dataset 39, input: 0.00118536, achieved: 0.00118536 +[default0]: dataset 40, input: 0.00117488, achieved: 0.00117487 +[default0]: dataset 41, input: 0.00114928, achieved: 0.00114929 +[default0]: dataset 42, input: 0.00112334, achieved: 0.00112334 +[default0]: dataset 43, input: 0.00112318, achieved: 0.00112315 +[default0]: dataset 44, input: 0.00111237, achieved: 0.00111236 +[default0]: dataset 45, input: 0.00110439, achieved: 0.00110444 +[default0]:> elapsed time for building blendable dataset indices: 0.53 (sec) +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.217538 seconds +[default0]: number of documents: 15234080 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [14472376, 15234080) total of 761704 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_17016ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_17016ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_17016ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.099 seconds +[default0]: total number of samples: 221750 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.164266 seconds +[default0]: number of documents: 6142390 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [5835270, 6142390) total of 307120 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_5785ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_5785ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_5785ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.061 seconds +[default0]: total number of samples: 136143 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.210011 seconds +[default0]: number of documents: 26176998 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [24868148, 26176998) total of 1308850 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_67033ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_67033ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_67033ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.145 seconds +[default0]: total number of samples: 432311 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.119616 seconds +[default0]: number of documents: 20844665 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [19802432, 20844665) total of 1042233 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_114084ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_114084ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_114084ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.085 seconds +[default0]: total number of samples: 521545 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.199993 seconds +[default0]: number of documents: 67005817 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [63655526, 67005817) total of 3350291 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_54893ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_54893ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_54893ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.135 seconds +[default0]: total number of samples: 1740321 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.141433 seconds +[default0]: number of documents: 5149795 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4892305, 5149795) total of 257490 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_803ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_803ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_803ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.040 seconds +[default0]: total number of samples: 26370 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.221580 seconds +[default0]: number of documents: 58847091 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [55904736, 58847091) total of 2942355 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_67171ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_67171ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_67171ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.161 seconds +[default0]: total number of samples: 1458654 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.154222 seconds +[default0]: number of documents: 12514253 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [11888540, 12514253) total of 625713 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_5618ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_5618ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_5618ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.073 seconds +[default0]: total number of samples: 134071 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.071387 seconds +[default0]: number of documents: 180608 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [171578, 180608) total of 9030 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_57ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_57ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_57ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: total number of samples: 2501 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.093984 seconds +[default0]: number of documents: 12303134 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [11687977, 12303134) total of 615157 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_2827ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_2827ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_2827ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.059 seconds +[default0]: total number of samples: 157244 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.110994 seconds +[default0]: number of documents: 2033057 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [1931404, 2033057) total of 101653 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_207ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_207ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_207ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.047 seconds +[default0]: total number of samples: 20517 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.167589 seconds +[default0]: number of documents: 26793553 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [25453875, 26793553) total of 1339678 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_3844ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_3844ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_3844ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.086 seconds +[default0]: total number of samples: 101502 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.166464 seconds +[default0]: number of documents: 3155990 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2998190, 3155990) total of 157800 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_319ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_319ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_319ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.033 seconds +[default0]: total number of samples: 44182 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.138534 seconds +[default0]: number of documents: 6692522 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [6357896, 6692522) total of 334626 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_532ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_532ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_532ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.034 seconds +[default0]: total number of samples: 47613 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.109723 seconds +[default0]: number of documents: 3017261 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2866398, 3017261) total of 150863 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_258ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_258ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_258ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.027 seconds +[default0]: total number of samples: 29298 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.171213 seconds +[default0]: number of documents: 3648041 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [3465639, 3648041) total of 182402 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_344ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_344ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_344ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: total number of samples: 5659 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.091893 seconds +[default0]: number of documents: 4327282 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4110918, 4327282) total of 216364 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_185ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_185ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_185ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.025 seconds +[default0]: total number of samples: 12423 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.156650 seconds +[default0]: number of documents: 2698896 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2563951, 2698896) total of 134945 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_262ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_262ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_262ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.035 seconds +[default0]: total number of samples: 19133 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.137540 seconds +[default0]: number of documents: 12767593 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [12129213, 12767593) total of 638380 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_1088ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_1088ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_1088ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.046 seconds +[default0]: total number of samples: 87928 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.112102 seconds +[default0]: number of documents: 4342323 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4125207, 4342323) total of 217116 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_470ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_470ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_470ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.065 seconds +[default0]: total number of samples: 69780 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.093231 seconds +[default0]: number of documents: 3022722 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2871586, 3022722) total of 151136 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_641ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_641ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_641ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: total number of samples: 22532 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.099101 seconds +[default0]: number of documents: 1162568 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [1104440, 1162568) total of 58128 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_163ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_163ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_163ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.008 seconds +[default0]: total number of samples: 1608 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.209260 seconds +[default0]: number of documents: 55294645 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [52529913, 55294645) total of 2764732 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_41871ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_41871ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_41871ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.129 seconds +[default0]: total number of samples: 690621 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.177409 seconds +[default0]: number of documents: 44855616 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [42612835, 44855616) total of 2242781 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_28453ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_28453ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_28453ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.092 seconds +[default0]: total number of samples: 468689 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.205636 seconds +[default0]: number of documents: 31969891 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [30371396, 31969891) total of 1598495 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_25493ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_25493ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_25493ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.084 seconds +[default0]: total number of samples: 497625 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.123863 seconds +[default0]: number of documents: 34110375 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [32404856, 34110375) total of 1705519 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_12667ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_12667ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_12667ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.107 seconds +[default0]: total number of samples: 125120 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.196895 seconds +[default0]: number of documents: 43761623 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [41573542, 43761623) total of 2188081 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_62220ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_62220ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_62220ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.122 seconds +[default0]: total number of samples: 1010592 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.069725 seconds +[default0]: number of documents: 197602 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [187722, 197602) total of 9880 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_267ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_267ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_267ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.013 seconds +[default0]: total number of samples: 4451 +[default0]: total number of epochs: 1 +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.0330676, achieved: 0.0330676 +[default0]: dataset 1, input: 0.0112421, achieved: 0.0112421 +[default0]: dataset 2, input: 0.130272, achieved: 0.130272 +[default0]: dataset 3, input: 0.221712, achieved: 0.221712 +[default0]: dataset 4, input: 0.106678, achieved: 0.106678 +[default0]: dataset 5, input: 0.00155951, achieved: 0.00155955 +[default0]: dataset 6, input: 0.13054, achieved: 0.13054 +[default0]: dataset 7, input: 0.010918, achieved: 0.0109181 +[default0]: dataset 8, input: 0.000110214, achieved: 0.000110257 +[default0]: dataset 9, input: 0.00549238, achieved: 0.00549235 +[default0]: dataset 10, input: 0.000402122, achieved: 0.000402094 +[default0]: dataset 11, input: 0.00747007, achieved: 0.00747007 +[default0]: dataset 12, input: 0.000619047, achieved: 0.000619024 +[default0]: dataset 13, input: 0.00103353, achieved: 0.0010336 +[default0]: dataset 14, input: 0.000501201, achieved: 0.000501226 +[default0]: dataset 15, input: 0.000667277, achieved: 0.000667231 +[default0]: dataset 16, input: 0.000359281, achieved: 0.000359326 +[default0]: dataset 17, input: 0.000508443, achieved: 0.000508519 +[default0]: dataset 18, input: 0.00211373, achieved: 0.0021138 +[default0]: dataset 19, input: 0.000912995, achieved: 0.000912961 +[default0]: dataset 20, input: 0.00124543, achieved: 0.00124546 +[default0]: dataset 21, input: 0.000315887, achieved: 0.00031594 +[default0]: dataset 22, input: 0.0813721, achieved: 0.0813721 +[default0]: dataset 23, input: 0.0552939, achieved: 0.0552939 +[default0]: dataset 24, input: 0.0495415, achieved: 0.0495414 +[default0]: dataset 25, input: 0.0246164, achieved: 0.0246163 +[default0]: dataset 26, input: 0.120917, achieved: 0.120917 +[default0]: dataset 27, input: 0.000517703, achieved: 0.000517666 +[default0]:> elapsed time for building blendable dataset indices: 0.32 (sec) +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008661 seconds +[default0]: number of documents: 31495184 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [29920425, 31495184) total of 1574759 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007718 seconds +[default0]: number of documents: 31495184 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006597 seconds +[default0]: number of documents: 31495184 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_validation_indexmap_199220ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_validation_indexmap_199220ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.084 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002674 seconds +[default0]: number of documents: 5151349 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [4893782, 5151349) total of 257567 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007398 seconds +[default0]: number of documents: 5151349 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.015025 seconds +[default0]: number of documents: 5151349 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_validation_indexmap_40484ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_validation_indexmap_40484ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.042 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002715 seconds +[default0]: number of documents: 3562772 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [3384633, 3562772) total of 178139 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002572 seconds +[default0]: number of documents: 3562772 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002854 seconds +[default0]: number of documents: 3562772 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_validation_indexmap_32773ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_validation_indexmap_32773ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.047 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003843 seconds +[default0]: number of documents: 2707724 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2572338, 2707724) total of 135386 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006534 seconds +[default0]: number of documents: 2707724 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006124 seconds +[default0]: number of documents: 2707724 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_validation_indexmap_30102ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_validation_indexmap_30102ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.042 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004735 seconds +[default0]: number of documents: 5055942 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [4803145, 5055942) total of 252797 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006722 seconds +[default0]: number of documents: 5055942 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013336 seconds +[default0]: number of documents: 5055942 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_validation_indexmap_29656ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_validation_indexmap_29656ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.059 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002322 seconds +[default0]: number of documents: 2148955 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2041507, 2148955) total of 107448 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002611 seconds +[default0]: number of documents: 2148955 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013636 seconds +[default0]: number of documents: 2148955 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_validation_indexmap_25008ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_validation_indexmap_25008ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.047 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011605 seconds +[default0]: number of documents: 2627392 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2496022, 2627392) total of 131370 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003371 seconds +[default0]: number of documents: 2627392 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012131 seconds +[default0]: number of documents: 2627392 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_validation_indexmap_24628ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_validation_indexmap_24628ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.039 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005218 seconds +[default0]: number of documents: 3560556 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [3382528, 3560556) total of 178028 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006408 seconds +[default0]: number of documents: 3560556 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013297 seconds +[default0]: number of documents: 3560556 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_validation_indexmap_24541ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_validation_indexmap_24541ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.041 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002349 seconds +[default0]: number of documents: 1543441 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1466269, 1543441) total of 77172 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003335 seconds +[default0]: number of documents: 1543441 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.016025 seconds +[default0]: number of documents: 1543441 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_validation_indexmap_23492ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_validation_indexmap_23492ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.028 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002968 seconds +[default0]: number of documents: 1667306 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1583941, 1667306) total of 83365 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003054 seconds +[default0]: number of documents: 1667306 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012147 seconds +[default0]: number of documents: 1667306 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_validation_indexmap_16582ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_validation_indexmap_16582ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002085 seconds +[default0]: number of documents: 855756 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [812968, 855756) total of 42788 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002557 seconds +[default0]: number of documents: 855756 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.017323 seconds +[default0]: number of documents: 855756 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_validation_indexmap_10257ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_validation_indexmap_10257ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.026 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002479 seconds +[default0]: number of documents: 573364 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [544696, 573364) total of 28668 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008455 seconds +[default0]: number of documents: 573364 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001867 seconds +[default0]: number of documents: 573364 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_validation_indexmap_7127ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_validation_indexmap_7127ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.022 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001847 seconds +[default0]: number of documents: 410633 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [390101, 410633) total of 20532 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001559 seconds +[default0]: number of documents: 410633 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001752 seconds +[default0]: number of documents: 410633 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_validation_indexmap_4943ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_validation_indexmap_4943ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001577 seconds +[default0]: number of documents: 428843 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [407401, 428843) total of 21442 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001256 seconds +[default0]: number of documents: 428843 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001154 seconds +[default0]: number of documents: 428843 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_validation_indexmap_4453ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_validation_indexmap_4453ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.022 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001421 seconds +[default0]: number of documents: 417269 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [396406, 417269) total of 20863 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001325 seconds +[default0]: number of documents: 417269 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001231 seconds +[default0]: number of documents: 417269 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_validation_indexmap_3563ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_validation_indexmap_3563ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.024 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002977 seconds +[default0]: number of documents: 1114455 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1058732, 1114455) total of 55723 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002765 seconds +[default0]: number of documents: 1114455 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013265 seconds +[default0]: number of documents: 1114455 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_validation_indexmap_2999ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_validation_indexmap_2999ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.022 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001909 seconds +[default0]: number of documents: 347499 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [330124, 347499) total of 17375 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003844 seconds +[default0]: number of documents: 347499 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002489 seconds +[default0]: number of documents: 347499 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_validation_indexmap_2998ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_validation_indexmap_2998ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.027 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001206 seconds +[default0]: number of documents: 339210 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [322250, 339210) total of 16960 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002326 seconds +[default0]: number of documents: 339210 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000980 seconds +[default0]: number of documents: 339210 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_validation_indexmap_2798ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_validation_indexmap_2798ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.021 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001226 seconds +[default0]: number of documents: 315754 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [299966, 315754) total of 15788 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000932 seconds +[default0]: number of documents: 315754 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000810 seconds +[default0]: number of documents: 315754 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_validation_indexmap_2105ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_validation_indexmap_2105ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.032 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003248 seconds +[default0]: number of documents: 918416 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [872495, 918416) total of 45921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002535 seconds +[default0]: number of documents: 918416 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002320 seconds +[default0]: number of documents: 918416 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_validation_indexmap_1887ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_validation_indexmap_1887ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003063 seconds +[default0]: number of documents: 950097 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [902592, 950097) total of 47505 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002515 seconds +[default0]: number of documents: 950097 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014229 seconds +[default0]: number of documents: 950097 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_validation_indexmap_1739ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_validation_indexmap_1739ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.018 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002909 seconds +[default0]: number of documents: 915063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869310, 915063) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003988 seconds +[default0]: number of documents: 915063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009547 seconds +[default0]: number of documents: 915063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_validation_indexmap_1455ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_validation_indexmap_1455ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004578 seconds +[default0]: number of documents: 915061 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869308, 915061) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.019095 seconds +[default0]: number of documents: 915061 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002350 seconds +[default0]: number of documents: 915061 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_validation_indexmap_1410ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_validation_indexmap_1410ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003431 seconds +[default0]: number of documents: 915058 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869305, 915058) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006980 seconds +[default0]: number of documents: 915058 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006009 seconds +[default0]: number of documents: 915058 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_validation_indexmap_1362ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_validation_indexmap_1362ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003041 seconds +[default0]: number of documents: 865056 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [821803, 865056) total of 43253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004631 seconds +[default0]: number of documents: 865056 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007017 seconds +[default0]: number of documents: 865056 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_validation_indexmap_1350ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_validation_indexmap_1350ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.018 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004431 seconds +[default0]: number of documents: 915044 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869292, 915044) total of 45752 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.016982 seconds +[default0]: number of documents: 915044 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002366 seconds +[default0]: number of documents: 915044 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_validation_indexmap_1339ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_validation_indexmap_1339ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.026 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003402 seconds +[default0]: number of documents: 915043 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869291, 915043) total of 45752 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008767 seconds +[default0]: number of documents: 915043 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006407 seconds +[default0]: number of documents: 915043 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_validation_indexmap_1334ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_validation_indexmap_1334ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003800 seconds +[default0]: number of documents: 915021 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869270, 915021) total of 45751 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003881 seconds +[default0]: number of documents: 915021 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003188 seconds +[default0]: number of documents: 915021 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_validation_indexmap_1262ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_validation_indexmap_1262ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.023 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002901 seconds +[default0]: number of documents: 915054 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869301, 915054) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003239 seconds +[default0]: number of documents: 915054 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005957 seconds +[default0]: number of documents: 915054 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_validation_indexmap_1260ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_validation_indexmap_1260ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.029 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002449 seconds +[default0]: number of documents: 915051 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869298, 915051) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002953 seconds +[default0]: number of documents: 915051 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004734 seconds +[default0]: number of documents: 915051 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_validation_indexmap_1229ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_validation_indexmap_1229ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.023 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001658 seconds +[default0]: number of documents: 318189 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [302280, 318189) total of 15909 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001610 seconds +[default0]: number of documents: 318189 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001128 seconds +[default0]: number of documents: 318189 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_validation_indexmap_1032ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_validation_indexmap_1032ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001186 seconds +[default0]: number of documents: 265864 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [252571, 265864) total of 13293 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001020 seconds +[default0]: number of documents: 265864 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000779 seconds +[default0]: number of documents: 265864 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_validation_indexmap_936ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_validation_indexmap_936ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.016 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001239 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000847 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000845 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_validation_indexmap_885ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_validation_indexmap_885ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.020 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000998 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001059 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000958 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_validation_indexmap_864ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_validation_indexmap_864ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001305 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000960 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000867 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_validation_indexmap_836ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_validation_indexmap_836ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001554 seconds +[default0]: number of documents: 365060 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [346807, 365060) total of 18253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001211 seconds +[default0]: number of documents: 365060 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001056 seconds +[default0]: number of documents: 365060 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_validation_indexmap_676ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_validation_indexmap_676ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001737 seconds +[default0]: number of documents: 365063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [346810, 365063) total of 18253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001216 seconds +[default0]: number of documents: 365063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001035 seconds +[default0]: number of documents: 365063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_validation_indexmap_656ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_validation_indexmap_656ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001123 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000869 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000980 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_validation_indexmap_621ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_validation_indexmap_621ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.021 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000677 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251921, 265180) total of 13259 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001109 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000815 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_validation_indexmap_616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_validation_indexmap_616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001203 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000939 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000895 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_validation_indexmap_610ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_validation_indexmap_610ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001105 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002628 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000911 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_validation_indexmap_605ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_validation_indexmap_605ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001093 seconds +[default0]: number of documents: 271191 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [257631, 271191) total of 13560 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001087 seconds +[default0]: number of documents: 271191 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000882 seconds +[default0]: number of documents: 271191 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_validation_indexmap_592ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_validation_indexmap_592ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001096 seconds +[default0]: number of documents: 269973 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [256474, 269973) total of 13499 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000862 seconds +[default0]: number of documents: 269973 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000749 seconds +[default0]: number of documents: 269973 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_validation_indexmap_579ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_validation_indexmap_579ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001001 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251817, 265071) total of 13254 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002379 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000947 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_validation_indexmap_578ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_validation_indexmap_578ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.013 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001012 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251921, 265180) total of 13259 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002841 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000826 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_validation_indexmap_573ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_validation_indexmap_573ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.034 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001394 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251817, 265071) total of 13254 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000844 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000822 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_validation_indexmap_569ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_validation_indexmap_569ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.387164, achieved: 0.387163 +[default0]: dataset 1, input: 0.0786754, achieved: 0.0786745 +[default0]: dataset 2, input: 0.0636898, achieved: 0.0636904 +[default0]: dataset 3, input: 0.0584986, achieved: 0.0584976 +[default0]: dataset 4, input: 0.0576332, achieved: 0.0576328 +[default0]: dataset 5, input: 0.0485994, achieved: 0.0485991 +[default0]: dataset 6, input: 0.0478619, achieved: 0.0478619 +[default0]: dataset 7, input: 0.0476917, achieved: 0.0476916 +[default0]: dataset 8, input: 0.045653, achieved: 0.0456537 +[default0]: dataset 9, input: 0.0322254, achieved: 0.0322257 +[default0]: dataset 10, input: 0.0199319, achieved: 0.0199317 +[default0]: dataset 11, input: 0.0138497, achieved: 0.0138502 +[default0]: dataset 12, input: 0.00960604, achieved: 0.00960574 +[default0]: dataset 13, input: 0.00865243, achieved: 0.00865232 +[default0]: dataset 14, input: 0.00692261, achieved: 0.00692254 +[default0]: dataset 15, input: 0.00582803, achieved: 0.0058278 +[default0]: dataset 16, input: 0.00582586, achieved: 0.0058261 +[default0]: dataset 17, input: 0.00543684, achieved: 0.00543622 +[default0]: dataset 18, input: 0.00409056, achieved: 0.00409121 +[default0]: dataset 19, input: 0.00366562, achieved: 0.00366557 +[default0]: dataset 20, input: 0.00337934, achieved: 0.00337955 +[default0]: dataset 21, input: 0.00282756, achieved: 0.00282792 +[default0]: dataset 22, input: 0.00274012, achieved: 0.00273939 +[default0]: dataset 23, input: 0.00264619, achieved: 0.00264575 +[default0]: dataset 24, input: 0.00262358, achieved: 0.00262362 +[default0]: dataset 25, input: 0.0026003, achieved: 0.00259978 +[default0]: dataset 26, input: 0.00259099, achieved: 0.00259127 +[default0]: dataset 27, input: 0.00245151, achieved: 0.00245166 +[default0]: dataset 28, input: 0.00244735, achieved: 0.00244826 +[default0]: dataset 29, input: 0.00238684, achieved: 0.00238696 +[default0]: dataset 30, input: 0.0020053, achieved: 0.00200559 +[default0]: dataset 31, input: 0.00181876, achieved: 0.00181831 +[default0]: dataset 32, input: 0.00171918, achieved: 0.00171957 +[default0]: dataset 33, input: 0.00167779, achieved: 0.00167871 +[default0]: dataset 34, input: 0.00162359, achieved: 0.00162423 +[default0]: dataset 35, input: 0.00131237, achieved: 0.00131266 +[default0]: dataset 36, input: 0.00127347, achieved: 0.0012735 +[default0]: dataset 37, input: 0.00120564, achieved: 0.0012054 +[default0]: dataset 38, input: 0.00119533, achieved: 0.00119518 +[default0]: dataset 39, input: 0.00118536, achieved: 0.00118497 +[default0]: dataset 40, input: 0.00117488, achieved: 0.00117475 +[default0]: dataset 41, input: 0.00114928, achieved: 0.00114922 +[default0]: dataset 42, input: 0.00112334, achieved: 0.00112368 +[default0]: dataset 43, input: 0.00112318, achieved: 0.00112368 +[default0]: dataset 44, input: 0.00111237, achieved: 0.00111176 +[default0]: dataset 45, input: 0.00110439, achieved: 0.00110495 +[default0]:> elapsed time for building blendable dataset indices: 0.03 (sec) +[default0]:> finished creating T0 datasets ... +[default0]:[after dataloaders are built] datetime: 2022-10-06 12:40:15 +[default0]:done with setup ... +[default0]:training ... +[default0]:Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +[default0]:[000-000] 0.5592B / 0.3023B +[default7]:time (ms) | model-and-optimizer-setup: 9744.61 | train/valid/test-data-iterators-setup: 24469.03 +[default0]:[before the start of training step] datetime: 2022-10-06 12:40:15 +[default0]:[2022-10-06 12:40:15,265] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information +[default0]:[2022-10-06 12:40:15,266] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False +[default0]:[2022-10-06 12:40:15,266] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 24 total layers +[default0]:[2022-10-06 12:40:15,266] [INFO] [checkpointing.py:554:forward] ----Synchronization False +[default0]:[2022-10-06 12:40:15,266] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False +[default0]:[2022-10-06 12:40:24,388] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 4096 +[default0]:[Rank 0] (after 1 iterations) memory (MB) | allocated: 5196.2685546875 | max allocated: 9317.13427734375 | reserved: 10712.0 | max reserved: 10712.0 +[default7]: iteration 1/ 6200 | consumed samples: 1024 | consumed tokens: 2097152 | elapsed time per iteration (s): 9.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 4.731792E+00 | loss scale: 4096.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 108.240 | TFLOPs: 32.99 | +[default0]:[2022-10-06 12:40:31,940] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048.0 +[default0]:saving checkpoint at iteration 2 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 12:40:31,956] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2 is begin to save! +[default7]: iteration 2/ 6200 | consumed samples: 2048 | consumed tokens: 4194304 | elapsed time per iteration (s): 7.31 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 4.705722E+00 | loss scale: 2048.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 140.044 | TFLOPs: 42.69 | +[default0]:[2022-10-06 12:40:32,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:32,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:32,771] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:32,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:32,805] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:32,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:32,837] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:32,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:32,866] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:32,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:32,896] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:32,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:32,927] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:32,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:32,968] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:32,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:32,996] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,028] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,057] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,115] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,230] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,259] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,289] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,318] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,347] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,376] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,434] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,463] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:40:33,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,494] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:33,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,524] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:40:33,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:33,526] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/mp_rank_00_model_states.pt +[default0]:[2022-10-06 12:40:33,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:40:33,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:40:33,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:40:33,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:40:33,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:40:33,810] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:40:33,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:40:33,878] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:40:33,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:40:33,795] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:40:33,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:40:33,826] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:40:33,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:40:33,861] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:40:33,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:40:33,800] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:40:33,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:40:33,876] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:40:33,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:40:33,867] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:40:33,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:40:33,868] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:40:33,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:40:33,828] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:40:33,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:40:33,889] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:40:33,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:40:33,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:40:33,857] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:40:33,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:40:33,926] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:40:33,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:40:33,880] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:40:33,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:40:33,970] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:40:33,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:40:33,938] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:40:33,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:40:33,931] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:40:33,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:40:33,885] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:40:33,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:40:33,937] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:40:33,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:40:33,915] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:40:33,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:40:33,907] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:40:33,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:40:33,940] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:40:33,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:40:33,953] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:40:33,903] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:40:33,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:40:33,998] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:40:33,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:40:33,997] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:40:33,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:40:33,990] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:40:34,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:40:34,013] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:40:34,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:40:34,043] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:40:34,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:40:34,025] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default1]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default4]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default2]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default0]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default7]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default0]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default3]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default4]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default7]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default5]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default4]:[2022-10-06 12:40:34,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:40:34,103] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default0]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default5]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default2]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default5]:[2022-10-06 12:40:34,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:40:34,102] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default6]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default1]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default1]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default0]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default0]: successfully saved checkpoint at iteration 2 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default6]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default2]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default2]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default7]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default3]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default3]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default6]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default4]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default6]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default1]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default7]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default7]:time (ms) | save-checkpoint: 2159.48 +[default5]:[2022-10-06 12:40:34,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2 is ready now! +[default0]:[2022-10-06 12:40:41,412] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 +[default7]: iteration 3/ 6200 | consumed samples: 3072 | consumed tokens: 6291456 | elapsed time per iteration (s): 9.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 4.747199E+00 | loss scale: 1024.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 108.103 | TFLOPs: 32.95 | +[default0]:[2022-10-06 12:40:48,696] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1024.0, reducing to 512.0 +[default0]:saving checkpoint at iteration 4 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 12:40:48,703] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4 is begin to save! +[default0]:[2022-10-06 12:40:48,705] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_01-model_00-model_states.pt... +[default7]: iteration 4/ 6200 | consumed samples: 4096 | consumed tokens: 8388608 | elapsed time per iteration (s): 7.28 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 4.828533E+00 | loss scale: 512.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 140.592 | TFLOPs: 42.85 | +[default0]:[2022-10-06 12:40:49,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,138] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,230] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,259] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,288] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,316] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,375] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,404] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,433] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,491] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,550] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,581] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,622] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,718] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,747] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,844] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:40:49,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,875] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,905] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:40:49,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:40:49,907] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/mp_rank_00_model_states.pt +[default0]:[2022-10-06 12:40:49,907] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:40:49,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:40:49,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:40:50,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:40:50,182] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:40:50,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:40:50,183] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:40:50,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:40:50,188] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:40:50,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:40:50,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:40:50,166] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:40:50,170] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:40:50,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:40:50,185] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:40:50,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:40:50,196] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:40:50,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:40:50,216] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:40:50,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:40:50,202] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:40:50,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:40:50,216] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:40:50,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:40:50,208] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:40:50,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:40:50,287] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:40:50,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:40:50,253] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:40:50,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:40:50,295] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:40:50,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:40:50,295] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:40:50,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:40:50,212] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:40:50,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:40:50,217] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:40:50,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:40:50,219] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:40:50,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:40:50,296] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:40:50,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:40:50,252] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:40:50,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:40:50,274] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:40:50,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:40:50,280] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:40:50,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:40:50,297] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:40:50,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:40:50,295] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:40:50,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:40:50,315] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:40:50,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:40:50,276] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:40:50,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:40:50,291] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default2]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default0]:[2022-10-06 12:40:50,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:40:50,327] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default7]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default0]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default4]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default3]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default4]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default7]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default5]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default2]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default5]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default0]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default6]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default5]:[2022-10-06 12:40:50,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:40:50,377] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default1]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default1]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default0]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default0]: successfully saved checkpoint at iteration 4 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default6]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default2]:[2022-10-06 12:40:50,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:40:50,336] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default2]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default3]:[2022-10-06 12:40:50,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:40:50,324] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default6]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default7]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default4]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default1]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default3]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default6]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default5]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default7]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default7]:time (ms) | save-checkpoint: 1677.04 +[default3]:[2022-10-06 12:40:50,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:40:50,362] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default1]:[2022-10-06 12:40:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4 is ready now! +[default7]: iteration 5/ 6200 | consumed samples: 5120 | consumed tokens: 10485760 | elapsed time per iteration (s): 9.16 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 4.725605E+00 | loss scale: 512.0 | grad norm: 236.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 111.812 | TFLOPs: 34.08 | +[default0]:saving checkpoint at iteration 6 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 12:41:05,319] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step6 is begin to save! +[default0]:[2022-10-06 12:41:05,321] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_01-model_00-model_states.pt... +[default7]: iteration 6/ 6200 | consumed samples: 6144 | consumed tokens: 12582912 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 3.944874E+00 | loss scale: 512.0 | grad norm: 150.833 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.302 | TFLOPs: 41.85 | +[default0]:[2022-10-06 12:41:05,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:05,784] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:05,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:05,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:05,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:05,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:05,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:05,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:05,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:05,909] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:05,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:05,938] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:05,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:05,967] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:05,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:05,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,025] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,090] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,119] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,321] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,349] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,380] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:41:06,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,445] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,474] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:41:06,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,502] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:06,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:06,504] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/mp_rank_00_model_states.pt +[default0]:[2022-10-06 12:41:06,504] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:41:06,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:41:06,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:41:06,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:41:06,715] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:41:06,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:41:06,797] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:41:06,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:41:06,758] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:41:06,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:41:06,801] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:41:06,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:41:06,811] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:41:06,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:41:06,819] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:41:06,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:41:06,738] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:41:06,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:41:06,770] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:41:06,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:41:06,832] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:41:06,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:41:06,830] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:41:06,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:41:06,851] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:41:06,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:41:06,856] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:41:06,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:41:06,899] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:41:06,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:41:06,840] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:41:06,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:41:06,854] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:41:06,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:41:06,831] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:41:06,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:41:06,896] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:41:06,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:41:06,849] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:41:06,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:41:06,850] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:41:06,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:41:06,878] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:41:06,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:41:06,849] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:41:06,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:41:06,850] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:41:07,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:41:07,006] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:41:07,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:41:07,022] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:41:07,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:41:07,002] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:41:07,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:41:07,039] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:41:07,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:41:07,037] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:41:07,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:41:07,103] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default5]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default1]:[2022-10-06 12:41:07,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default1]:[2022-10-06 12:41:07,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:41:07,087] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default0]:[2022-10-06 12:41:07,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:41:07,037] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default0]: successfully saved checkpoint at iteration 6 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default6]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default2]:[2022-10-06 12:41:07,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:41:07,094] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default2]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default6]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default3]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default1]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default3]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default6]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default4]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default7]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default5]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default7]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default7]:time (ms) | save-checkpoint: 1807.51 +[default3]:[2022-10-06 12:41:07,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:41:07,124] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default1]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default4]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default2]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default0]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default7]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default0]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default3]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default4]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default4]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default6]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default7]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default0]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default5]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default2]:[2022-10-06 12:41:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6 is ready now! +[default7]: iteration 7/ 6200 | consumed samples: 7168 | consumed tokens: 14680064 | elapsed time per iteration (s): 9.27 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 3.696283E+00 | loss scale: 512.0 | grad norm: 175.994 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 110.463 | TFLOPs: 33.67 | +[default0]:saving checkpoint at iteration 8 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 12:41:22,023] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step8 is begin to save! +[default0]:[2022-10-06 12:41:22,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_01-model_00-model_states.pt... +[default7]: iteration 8/ 6200 | consumed samples: 8192 | consumed tokens: 16777216 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 3.499207E+00 | loss scale: 512.0 | grad norm: 100.583 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.744 | TFLOPs: 41.99 | +[default0]:[2022-10-06 12:41:22,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,475] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,508] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,578] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,672] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,762] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,793] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,823] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,852] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,885] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,914] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:22,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:22,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:23,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:23,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:23,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:23,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:23,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:23,062] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:23,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:23,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:23,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:23,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:23,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:23,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:23,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:23,190] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:23,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:23,219] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:23,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:23,221] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/mp_rank_00_model_states.pt +[default0]:[2022-10-06 12:41:23,221] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:41:23,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:41:23,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:41:23,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:41:23,493] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:41:23,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:41:23,556] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:41:23,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:41:23,533] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:41:23,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:41:23,485] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:41:23,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:41:23,587] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:41:23,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:41:23,628] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:41:23,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:41:23,579] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:41:23,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:41:23,600] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:41:23,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:41:23,632] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:41:23,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:41:23,581] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:41:23,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:41:23,581] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:41:23,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:41:23,636] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:41:23,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:41:23,556] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:41:23,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:41:23,627] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:41:23,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:41:23,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:41:23,651] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:41:23,651] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:41:23,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:41:23,652] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:41:23,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:41:23,599] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:41:23,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:41:23,635] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:41:23,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:41:23,659] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:41:23,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:41:23,676] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:41:23,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:41:23,643] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:41:23,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:41:23,688] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:41:23,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:41:23,667] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:41:23,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:41:23,741] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:41:23,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:41:23,737] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:41:23,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:41:23,686] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:41:23,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:41:23,710] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:41:23,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:41:23,681] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default7]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default7]:time (ms) | save-checkpoint: 1737.11 +[default5]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default3]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default1]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default4]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default2]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default7]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default0]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default0]:[2022-10-06 12:41:23,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:41:23,741] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default3]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default4]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default4]:[2022-10-06 12:41:23,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:41:23,758] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default6]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default7]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default0]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default5]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default2]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default1]:[2022-10-06 12:41:23,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:41:23,753] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step8/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default5]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default5]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default1]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default0]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default0]: successfully saved checkpoint at iteration 8 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default2]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default6]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default2]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default6]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default3]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default3]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default6]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default4]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default1]:[2022-10-06 12:41:23,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8 is ready now! +[default7]: iteration 9/ 6200 | consumed samples: 9216 | consumed tokens: 18874368 | elapsed time per iteration (s): 9.16 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 3.386456E+00 | loss scale: 512.0 | grad norm: 44.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 111.730 | TFLOPs: 34.06 | +[default0]:saving checkpoint at iteration 10 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 12:41:38,624] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step10 is begin to save! +[default0]:[2022-10-06 12:41:38,627] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default7]: iteration 10/ 6200 | consumed samples: 10240 | consumed tokens: 20971520 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 3.282778E+00 | loss scale: 512.0 | grad norm: 30.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.703 | TFLOPs: 41.97 | +[default0]:[2022-10-06 12:41:39,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,089] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,151] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,180] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,209] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,237] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,267] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,300] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,331] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,361] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,390] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,447] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,477] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,506] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,610] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,638] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,667] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,698] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,755] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:41:39,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,783] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:41:39,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,811] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:41:39,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:41:39,813] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt +[default0]:[2022-10-06 12:41:39,813] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:41:39,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:41:39,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:41:40,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:41:40,118] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:41:40,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:41:40,054] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:41:40,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:41:40,147] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:41:40,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:41:40,072] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:41:40,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:41:40,133] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:41:40,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:41:40,146] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:41:40,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:41:40,132] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:41:40,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:41:40,115] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:41:40,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:41:40,161] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 12:41:40,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:41:40,157] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:41:40,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:41:40,139] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:41:40,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:41:40,160] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:41:40,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:41:40,145] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:41:40,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:41:40,147] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:41:40,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:41:40,119] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:41:40,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:41:40,198] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:41:40,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:41:40,199] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:41:40,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:41:40,232] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:41:40,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:41:40,172] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:41:40,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:41:40,162] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:41:40,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:41:40,194] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:41:40,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:41:40,242] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:41:40,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:41:40,268] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:41:40,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:41:40,217] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:41:40,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:41:40,227] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 12:41:40,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:41:40,256] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 12:41:40,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:41:40,267] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 12:41:40,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:41:40,209] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:41:40,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:41:40,231] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default2]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default7]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default0]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default6]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default4]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default3]:[2022-10-06 12:41:40,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:41:40,270] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default0]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default4]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default7]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default5]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default0]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default2]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default5]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default1]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default5]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default1]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default0]:[2022-10-06 12:41:40,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:41:40,279] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default0]: successfully saved checkpoint at iteration 10 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default2]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default6]:[2022-10-06 12:41:40,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:41:40,296] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default2]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default6]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default3]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default4]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default3]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default6]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default1]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default7]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default7]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default7]:time (ms) | save-checkpoint: 1673.38 +[default5]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default3]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default1]:[2022-10-06 12:41:40,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10 is ready now! +[default7]: iteration 11/ 6200 | consumed samples: 11264 | consumed tokens: 23068672 | elapsed time per iteration (s): 9.12 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 3.188134E+00 | loss scale: 512.0 | grad norm: 22.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 112.247 | TFLOPs: 34.21 | +srun: Job step aborted: Waiting up to 62 seconds for job step to finish. +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 693967 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 693968 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +slurmstepd: error: *** STEP 2075274.0 ON jean-zay-iam14 CANCELLED AT 2022-10-06T12:41:51 *** +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3157691 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 693969 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3157692 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3542039 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3157693 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3542040 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 283914 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 693970 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 283915 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3157694 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3542041 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 283916 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3542042 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 693971 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 693972 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3157695 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3157696 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 283917 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 693973 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 283918 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3542043 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 693974 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 283919 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 283920 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 283921 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3157697 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3157698 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3542044 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3542045 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3542046 closing signal SIGTERM +[default2]:Traceback (most recent call last): +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/multiprocessing/resource_sharer.py", line 142, in _serve +[default2]: with self._listener.accept() as conn: +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/multiprocessing/connection.py", line 465, in accept +[default2]: deliver_challenge(c, self._authkey) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/multiprocessing/connection.py", line 740, in deliver_challenge +[default2]: response = connection.recv_bytes(256) # reject large message +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes +[default2]: buf = self._recv_bytes(maxlength) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes +[default2]: buf = self._recv(4) +[default2]: File "/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/multiprocessing/connection.py", line 379, in _recv +[default2]: chunk = read(handle, remaining) +[default2]:ConnectionResetError: [Errno 104] Connection reset by peer +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[default0]:using world size: 32, data-parallel-size: 32, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +[default0]:using torch.float16 for parameters ... +[default0]:------------------------ arguments ------------------------ +[default0]: abort_on_unmet_fused_kernel_constraints ......... True +[default0]: accumulate_allreduce_grads_in_fp32 .............. False +[default0]: adam_beta1 ...................................... 0.9 +[default0]: adam_beta2 ...................................... 0.95 +[default0]: adam_eps ........................................ 1e-08 +[default0]: adlr_autoresume ................................. False +[default0]: adlr_autoresume_interval ........................ 1000 +[default0]: apply_query_key_layer_scaling ................... True +[default0]: apply_residual_connection_post_layernorm ........ False +[default0]: attention_dropout ............................... 0.1 +[default0]: attention_softmax_in_fp32 ....................... False +[default0]: bert_binary_head ................................ True +[default0]: bert_load ....................................... None +[default0]: bf16 ............................................ False +[default0]: bias_dropout_fusion ............................. True +[default0]: bias_gelu_fusion ................................ True +[default0]: biencoder_projection_dim ........................ 0 +[default0]: biencoder_shared_query_context_model ............ False +[default0]: block_data_path ................................. None +[default0]: checkpoint_activations .......................... True +[default0]: checkpoint_in_cpu ............................... False +[default0]: checkpoint_num_layers ........................... 1 +[default0]: clip_grad ....................................... 1.0 +[default0]: codecarbon_dir .................................. None +[default0]: consumed_train_samples .......................... 0 +[default0]: consumed_train_tokens ........................... 0 +[default0]: consumed_valid_samples .......................... 0 +[default0]: contigious_checkpointing ........................ False +[default0]: cpu_optimizer ................................... False +[default0]: cpu_torch_adam .................................. False +[default0]: curriculum_learning ............................. False +[default0]: data_impl ....................................... mmap +[default0]: data_parallel_size .............................. 32 +[default0]: data_path ....................................... None +[default0]: dataloader_type ................................. single +[default0]: DDP_impl ........................................ local +[default0]: decoder_seq_length .............................. None +[default0]: deepscale ....................................... False +[default0]: deepscale_config ................................ None +[default0]: deepspeed ....................................... True +[default0]: deepspeed_activation_checkpointing .............. True +[default0]: deepspeed_config ................................ ./ds_config.2075302.json +[default0]: deepspeed_mpi ................................... False +[default0]: distribute_checkpointed_activations ............. False +[default0]: distributed_backend ............................. nccl +[default0]: embed_layernorm ................................. True +[default0]: embedding_path .................................. None +[default0]: encoder_seq_length .............................. 2048 +[default0]: eod_mask_loss ................................... False +[default0]: eval_interval ................................... 125 +[default0]: eval_iters ...................................... 10 +[default0]: eval_only ....................................... None +[default0]: evidence_data_path .............................. None +[default0]: exit_duration_in_mins ........................... 5990 +[default0]: exit_interval ................................... None +[default0]: ffn_hidden_size ................................. 4096 +[default0]: finetune ........................................ False +[default0]: fp16 ............................................ True +[default0]: fp16_lm_cross_entropy ........................... False +[default0]: fp32_residual_connection ........................ False +[default0]: gigaflos_no_embeds .............................. 0 +[default0]: global_batch_size ............................... 1024 +[default0]: glu_activation .................................. None +[default0]: hidden_dropout .................................. 0.1 +[default0]: hidden_size ..................................... 1024 +[default0]: hysteresis ...................................... 2 +[default0]: ict_head_size ................................... None +[default0]: ict_load ........................................ None +[default0]: img_dim ......................................... 224 +[default0]: indexer_batch_size .............................. 128 +[default0]: indexer_log_interval ............................ 1000 +[default0]: inference ....................................... False +[default0]: init_method_std ................................. 0.0048 +[default0]: init_method_xavier_uniform ...................... False +[default0]: initial_loss_scale .............................. 4294967296 +[default0]: kill_switch_path ................................ /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/kill-switch-tr13e-350M-mtf +[default0]: kv_channels ..................................... 64 +[default0]: layernorm_epsilon ............................... 1e-05 +[default0]: lazy_mpu_init ................................... None +[default0]: load ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: local_rank ...................................... None +[default0]: log_batch_size_to_tensorboard ................... True +[default0]: log_interval .................................... 1 +[default0]: log_learning_rate_to_tensorboard ................ True +[default0]: log_level ....................................... None +[default0]: log_level_replica ............................... None +[default0]: log_loss_scale_to_tensorboard ................... True +[default0]: log_num_zeros_in_grad ........................... False +[default0]: log_params_norm ................................. False +[default0]: log_path ........................................ None +[default0]: log_timers_to_tensorboard ....................... True +[default0]: log_validation_ppl_to_tensorboard ............... True +[default0]: loss_on_targets_only ............................ False +[default0]: loss_scale ...................................... None +[default0]: loss_scale_window ............................... 1000 +[default0]: lr .............................................. 2e-05 +[default0]: lr_decay_iters .................................. None +[default0]: lr_decay_samples ................................ None +[default0]: lr_decay_style .................................. constant +[default0]: lr_decay_tokens ................................. None +[default0]: lr_warmup_fraction .............................. None +[default0]: lr_warmup_iters ................................. 0 +[default0]: lr_warmup_samples ............................... 0 +[default0]: make_vocab_size_divisible_by .................... 128 +[default0]: mask_prob ....................................... 0.15 +[default0]: masked_softmax_fusion ........................... True +[default0]: max_position_embeddings ......................... 2048 +[default0]: mean_noise_span_length .......................... None +[default0]: memory_centric_tiled_linear ..................... False +[default0]: merge_file ...................................... None +[default0]: micro_batch_size ................................ 1 +[default0]: min_loss_scale .................................. 1.0 +[default0]: min_lr .......................................... 0.0 +[default0]: mmap_warmup ..................................... False +[default0]: no_load_optim ................................... None +[default0]: no_load_rng ..................................... None +[default0]: no_save_optim ................................... None +[default0]: no_save_rng ..................................... None +[default0]: noise_density ................................... None +[default0]: norm_target_loss ................................ True +[default0]: num_attention_heads ............................. 16 +[default0]: num_channels .................................... 3 +[default0]: num_classes ..................................... 1000 +[default0]: num_layers ...................................... 24 +[default0]: num_layers_per_virtual_pipeline_stage ........... None +[default0]: num_workers ..................................... 2 +[default0]: onnx_safe ....................................... None +[default0]: openai_gelu ..................................... False +[default0]: optimizer ....................................... adam +[default0]: override_lr_scheduler ........................... False +[default0]: pad_vocab_size_to ............................... 250880 +[default0]: params_dtype .................................... torch.float16 +[default0]: partition_activations ........................... False +[default0]: patch_dim ....................................... 16 +[default0]: pipeline_model_parallel_size .................... 1 +[default0]: position_embedding_type ......................... PositionEmbeddingType.alibi +[default0]: pp_partition_method ............................. type:transformer|embedding +[default0]: prefixlm ........................................ False +[default0]: profile_backward ................................ False +[default0]: query_in_block_prob ............................. 0.1 +[default0]: rampup_batch_size ............................... None +[default0]: rank ............................................ 0 +[default0]: remote_device ................................... none +[default0]: reset_attention_mask ............................ False +[default0]: reset_position_ids .............................. False +[default0]: reset_progress .................................. None +[default0]: retriever_report_topk_accuracies ................ [] +[default0]: retriever_score_scaling ......................... False +[default0]: retriever_seq_length ............................ 256 +[default0]: reweight_loss_based_on_position_frequency ....... False +[default0]: sample_rate ..................................... 1.0 +[default0]: save ............................................ /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]: save_interval ................................... 250 +[default0]: scatter_gather_tensors_in_pipeline .............. True +[default0]: scattered_embeddings ............................ False +[default0]: seed ............................................ 42 +[default0]: seq_length ...................................... 2048 +[default0]: sgd_momentum .................................... 0.9 +[default0]: short_seq_prob .................................. 0.1 +[default0]: skip_train_iteration_range ...................... None +[default0]: split ........................................... None +[default0]: split_transformers .............................. False +[default0]: sync_tp_duplicated_parameters ................... False +[default0]: synchronize_each_layer .......................... False +[default0]: tensor_model_parallel_size ...................... 1 +[default0]: tensorboard_dir ................................. /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/tr13e-350M-ml-logs/tensorboard/xp3capmixnewcodelonglossseq +[default0]: tensorboard_log_interval ........................ 1 +[default0]: tensorboard_queue_size .......................... 5 +[default0]: test_weighted_split_paths ....................... None +[default0]: test_weighted_split_paths_path .................. None +[default0]: tile_factor ..................................... 1 +[default0]: titles_data_path ................................ None +[default0]: tokenizer_name_or_path .......................... bigscience/tokenizer +[default0]: tokenizer_type .................................. PretrainedFromHF +[default0]: train_iters ..................................... None +[default0]: train_samples ................................... 6348800 +[default0]: train_tokens .................................... None +[default0]: train_weighted_split_names ...................... ['train'] +[default0]: train_weighted_split_paths ...................... [['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: train_weighted_split_paths_path ................. None +[default0]: train_weighted_split_splits ..................... [['0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950', '0:0.950']] +[default0]: train_weighted_split_weights .................... [['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: universal_checkpoint ............................ False +[default0]: use_bnb_optimizer ............................... False +[default0]: use_checkpoint_lr_scheduler ..................... False +[default0]: use_contiguous_buffers_in_ddp ................... False +[default0]: use_cpu_initialization .......................... None +[default0]: use_one_sent_docs ............................... False +[default0]: use_pin_memory .................................. False +[default0]: valid_num_workers ............................... 2 +[default0]: valid_weighted_split_names ...................... ['validation_pretraining', 'validation'] +[default0]: valid_weighted_split_paths ...................... [['/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document', '/gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document'], ['/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm', '/gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw']] +[default0]: valid_weighted_split_paths_path ................. None +[default0]: valid_weighted_split_splits ..................... [['0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0', '0.950:1.0'], ['0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1', '0.950:1']] +[default0]: valid_weighted_split_weights .................... [['0.0330676168743166', '0.011242051312222764', '0.13027200903379185', '0.22171164529099704', '0.10667815627928671', '0.0015595123898173287', '0.13054018439603915', '0.01091803753667153', '0.00011021422347108609', '0.005492381453597748', '0.0004021215011318779', '0.007470068593492175', '0.0006190467776576425', '0.0010335296343329384', '0.0005012010684646179', '0.0006672772956128299', '0.00035928138344705506', '0.0005084433130291778', '0.0021137328219915496', '0.0009129946225980253', '0.0012454301613725426', '0.00031588689199263235', '0.08137213783015229', '0.055293935695898196', '0.04954150576361177', '0.02461641286531197', '0.12091748245519074', '0.0005177025345001541'], ['0.3924620202', '0.0797519865', '0.0645613968', '0.0592991658', '0.0584218969', '0.0492644683', '0.0485168956', '0.048344327', '0.0462777165', '0.0326663657', '0.0202046859', '0.0140392334', '0.0097374884', '0.0087708344', '0.0070173416', '0.0059077793', '0.0059055884', '0.0055112422', '0.0041465344', '0.0037157869', '0.0034255885', '0.0028662571', '0.0027776135', '0.0026823974', '0.0026594781', '0.0026358847', '0.0026264474', '0.0024850557', '0.0024808426', '0.0024194999', '0.0020327371', '0.0018436532', '0.0017427072', '0.0017007448', '0.0016458059', '0.0013303289', '0.0012908943', '0.0012221364', '0.001211688', '0.0012015765', '0.0011909595', '0.0011650068', '0.001138717', '0.0011385485', '0.0011275944', '0.0011195053']] +[default0]: virtual_pipeline_model_parallel_size ............ None +[default0]: vocab_extra_ids ................................. 0 +[default0]: vocab_file ...................................... None +[default0]: weight_decay .................................... 0.0001 +[default0]: world_size ...................................... 32 +[default0]: zero_allgather_bucket_size ...................... 0.0 +[default0]: zero_contigious_gradients ....................... False +[default0]: zero_reduce_bucket_size ......................... 0.0 +[default0]: zero_reduce_scatter ............................. False +[default0]: zero_stage ...................................... 1 +[default0]:-------------------- end of arguments --------------------- +[default0]:setting number of micro-batches to constant 32 +[default0]:> building PretrainedFromHF tokenizer ... +[default0]: vocab file is un-used. loading tokenizer from pre-trained model +[default0]:Offline mode: forcing local_files_only=True +[default0]:Offline mode: forcing local_files_only=True +[default0]:loading file tokenizer.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer.json +[default0]:loading file added_tokens.json from cache at None +[default0]:loading file special_tokens_map.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/special_tokens_map.json +[default0]:loading file tokenizer_config.json from cache at /gpfswork/rech/six/commun/models/models--bigscience--tokenizer/snapshots/d56c744c331ce0ffa516ed084785eb22da74b91e/tokenizer_config.json +[default0]: > padded vocab (size: 250680) with 200 dummy tokens (new size: 250880) +[default0]:DeepSpeed general environment info: +[default0]:torch install path ............... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/torch'] +[default0]:torch version .................... 1.12.0 +[default0]:torch cuda version ............... 11.3 +[default0]:torch hip version ................ None +[default0]:nvcc version ..................... 11.4 +[default0]:deepspeed install path ........... ['/gpfswork/rech/six/commun/conda/tr13f-6B3-ml-t0/lib/python3.8/site-packages/deepspeed'] +[default0]:deepspeed info ................... 0.7.1+8b2a6371, 8b2a6371, master +[default0]:deepspeed wheel compiled w. ...... torch 1.12, cuda 11.3 +[default0]:**** Git info for Megatron: git_hash=6c1018f git_branch=mtf-multival **** +[default0]:> initializing torch distributed ... +[default0]:[2022-10-06 12:43:02,814] [INFO] [comm.py:628:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[default7]:> setting tensorboard ... +[default0]:> initializing tensor model parallel with size 1 +[default0]:> initializing pipeline model parallel with size 1 +[default0]:> setting random seeds to 42 ... +[default0]:[2022-10-06 12:43:04,832] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 2760 and data parallel seed: 42 +[default0]:> compiling dataset index builder ... +[default0]:make: Entering directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/data' +[default0]:make: Nothing to be done for 'default'. +[default0]:make: Leaving directory '/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/data' +[default0]:>>> done with dataset index builder. Compilation time: 0.058 seconds +[default0]:> compiling and loading fused kernels ... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_upper_triang_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module scaled_masked_softmax_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module scaled_masked_softmax_cuda... +[default0]:Detected CUDA files, patching ldflags +[default0]:Emitting ninja build file /gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +[default0]:Building extension module fused_mix_prec_layer_norm_cuda... +[default0]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default0]:ninja: no work to do. +[default0]:Loading extension module fused_mix_prec_layer_norm_cuda... +[default0]:>>> done with compiling and loading fused kernels. Compilation time: 6.455 seconds +[default0]:time to initialize megatron (seconds): 63.347 +[default0]:[after megatron is initialized] datetime: 2022-10-06 12:43:11 +[default0]:building GPT model ... +[default0]:[2022-10-06 12:43:11,394] [INFO] [utils.py:827:see_memory_usage] Before Building Model +[default0]:[2022-10-06 12:43:11,394] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +[default0]:[2022-10-06 12:43:11,394] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.32 GB, percent = 6.6% +[default0]:SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +[default0]:Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pipe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31} +[default0]:[2022-10-06 12:43:12,367] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer|embedding +[default0]:stage=0 layers=31 +[default0]: 0: _to_float16 +[default0]: 1: EmbeddingPipe +[default0]: 2: +[default0]: 3: ParallelTransformerLayerPipe +[default0]: 4: ParallelTransformerLayerPipe +[default0]: 5: ParallelTransformerLayerPipe +[default0]: 6: ParallelTransformerLayerPipe +[default0]: 7: ParallelTransformerLayerPipe +[default0]: 8: ParallelTransformerLayerPipe +[default0]: 9: ParallelTransformerLayerPipe +[default0]: 10: ParallelTransformerLayerPipe +[default0]: 11: ParallelTransformerLayerPipe +[default0]: 12: ParallelTransformerLayerPipe +[default0]: 13: ParallelTransformerLayerPipe +[default0]: 14: ParallelTransformerLayerPipe +[default0]: 15: ParallelTransformerLayerPipe +[default0]: 16: ParallelTransformerLayerPipe +[default0]: 17: ParallelTransformerLayerPipe +[default0]: 18: ParallelTransformerLayerPipe +[default0]: 19: ParallelTransformerLayerPipe +[default0]: 20: ParallelTransformerLayerPipe +[default0]: 21: ParallelTransformerLayerPipe +[default0]: 22: ParallelTransformerLayerPipe +[default0]: 23: ParallelTransformerLayerPipe +[default0]: 24: ParallelTransformerLayerPipe +[default0]: 25: ParallelTransformerLayerPipe +[default0]: 26: ParallelTransformerLayerPipe +[default0]: 27: undo +[default0]: 28: MixedFusedLayerNorm +[default0]: 29: EmbeddingPipe +[default0]: 30: float16_to_fp32 +[default0]: loss: CrossEntropy +[default0]:[2022-10-06 12:43:12,454] [INFO] [utils.py:827:see_memory_usage] After Building Model +[default0]:[2022-10-06 12:43:12,455] [INFO] [utils.py:828:see_memory_usage] MA 1.04 GB Max_MA 1.04 GB CA 1.09 GB Max_CA 1 GB +[default0]:[2022-10-06 12:43:12,455] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.44 GB, percent = 6.6% +[default0]:setting training iterations to 6200 +[default0]:> learning rate decay style: constant +[default0]:DeepSpeed is enabled. +[default0]:[2022-10-06 12:43:12,457] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.1+8b2a6371, git-hash=8b2a6371, git-branch=master +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:[2022-10-06 12:43:14,686] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[default0]:[2022-10-06 12:43:14,686] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +[default0]:[2022-10-06 12:43:14,686] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[default0]:[2022-10-06 12:43:14,695] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +[default0]:[2022-10-06 12:43:14,695] [INFO] [utils.py:52:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[default0]:[2022-10-06 12:43:14,695] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 1 optimizer +[default0]:[2022-10-06 12:43:14,695] [INFO] [stage_1_and_2.py:134:__init__] Reduce bucket size 500000000 +[default0]:[2022-10-06 12:43:14,695] [INFO] [stage_1_and_2.py:135:__init__] Allgather bucket size 500000000 +[default0]:[2022-10-06 12:43:14,695] [INFO] [stage_1_and_2.py:136:__init__] CPU Offload: False +[default0]:[2022-10-06 12:43:14,695] [INFO] [stage_1_and_2.py:137:__init__] Round robin gradient partitioning: False +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:Emitting ninja build file /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113/utils/build.ninja... +[default4]:Building extension module utils... +[default4]:Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[default4]:ninja: no work to do. +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.4331362247467041 seconds +[default3]:Loading extension module utils... +[default6]:Loading extension module utils... +[default7]:Loading extension module utils... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.4859006404876709 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.48593997955322266 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.486081600189209 seconds +[default4]:Loading extension module utils... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.4859657287597656 seconds +[default4]:Time to load utils op: 0.48595142364501953 seconds +[default5]:Loading extension module utils... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.4861295223236084 seconds +[default5]:Time to load utils op: 0.485795259475708 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.4859485626220703 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.4965400695800781 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.49652791023254395 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.47687602043151855 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.47657251358032227 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.4745211601257324 seconds +[default5]:Loading extension module utils... +[default3]:Loading extension module utils... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.4764242172241211 seconds +[default5]:Time to load utils op: 0.4769103527069092 seconds +[default3]:Time to load utils op: 0.4769260883331299 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.4769022464752197 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.48516368865966797 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.4938030242919922 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.4851677417755127 seconds +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.49485039710998535 seconds +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.49595046043395996 seconds +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.4853029251098633 seconds +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.4851679801940918 seconds +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.48517274856567383 seconds +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.4851839542388916 seconds +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.48517489433288574 seconds +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.48517489433288574 seconds +[default3]:Time to load utils op: 0.49511051177978516 seconds +[default6]:Time to load utils op: 0.49346184730529785 seconds +[default7]:Time to load utils op: 0.49272894859313965 seconds +[default7]:Rank: 23 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 20 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 26 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 27 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 18 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Rank: 16 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 19 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 17 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 2 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 3 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Rank: 21 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 22 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Rank: 5 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default3]:Rank: 11 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 6 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Rank: 7 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 4 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Rank: 8 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 9 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Rank: 12 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default2]:Rank: 10 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 1 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Rank: 0 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 14 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Rank: 13 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Rank: 15 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default6]:Rank: 30 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default7]:Rank: 31 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default1]:Rank: 25 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Rank: 24 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0012843608856201172 seconds +[default4]:Rank: 28 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0018649101257324219 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0017642974853515625 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0016601085662841797 seconds +[default5]:Rank: 29 partition count [32, 32, 32] and sizes[(12615680, False), (4849664, False), (10112, False)] +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0014886856079101562 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0017323493957519531 seconds +[default6]:Time to load utils op: 0.001420736312866211 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0013861656188964844 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0017347335815429688 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0004990100860595703 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0012590885162353516 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0005161762237548828 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0005831718444824219 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default3]:Loading extension module utils... +[default6]:Time to load utils op: 0.0005276203155517578 seconds +[default5]:Time to load utils op: 0.0004971027374267578 seconds +[default3]:Time to load utils op: 0.0005524158477783203 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default1]:Time to load utils op: 0.0005381107330322266 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.001438140869140625 seconds +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.00179290771484375 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0020651817321777344 seconds +[default4]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default4]:No modifications detected for re-loaded extension module utils, skipping build step... +[default4]:Loading extension module utils... +[default4]:Time to load utils op: 0.0016722679138183594 seconds +[default2]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default2]:No modifications detected for re-loaded extension module utils, skipping build step... +[default2]:Loading extension module utils... +[default2]:Time to load utils op: 0.0016629695892333984 seconds +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.0013935565948486328 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0015747547149658203 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0014848709106445312 seconds +[default1]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default1]:No modifications detected for re-loaded extension module utils, skipping build step... +[default1]:Loading extension module utils... +[default5]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default5]:No modifications detected for re-loaded extension module utils, skipping build step... +[default5]:Loading extension module utils... +[default5]:Time to load utils op: 0.0020546913146972656 seconds +[default1]:Time to load utils op: 0.002543926239013672 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0016417503356933594 seconds +[default3]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default3]:No modifications detected for re-loaded extension module utils, skipping build step... +[default3]:Loading extension module utils... +[default3]:Time to load utils op: 0.0015134811401367188 seconds +[default6]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default6]:No modifications detected for re-loaded extension module utils, skipping build step... +[default6]:Loading extension module utils... +[default6]:Time to load utils op: 0.0017571449279785156 seconds +[default7]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default7]:No modifications detected for re-loaded extension module utils, skipping build step... +[default7]:Loading extension module utils... +[default7]:Time to load utils op: 0.0018200874328613281 seconds +[default0]:[2022-10-06 12:43:17,472] [INFO] [utils.py:827:see_memory_usage] Before initializing optimizer states +[default0]:[2022-10-06 12:43:17,473] [INFO] [utils.py:828:see_memory_usage] MA 1.11 GB Max_MA 1.12 GB CA 1.84 GB Max_CA 2 GB +[default0]:[2022-10-06 12:43:17,473] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.3 GB, percent = 7.2% +[default0]:[2022-10-06 12:43:17,512] [INFO] [utils.py:827:see_memory_usage] After initializing optimizer states +[default0]:[2022-10-06 12:43:17,513] [INFO] [utils.py:828:see_memory_usage] MA 1.24 GB Max_MA 1.3 GB CA 1.89 GB Max_CA 2 GB +[default0]:[2022-10-06 12:43:17,513] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.3 GB, percent = 7.2% +[default0]:[2022-10-06 12:43:17,513] [INFO] [stage_1_and_2.py:516:__init__] optimizer state initialized +[default0]:[2022-10-06 12:43:17,540] [INFO] [utils.py:827:see_memory_usage] After initializing ZeRO optimizer +[default0]:[2022-10-06 12:43:17,540] [INFO] [utils.py:828:see_memory_usage] MA 1.24 GB Max_MA 1.24 GB CA 1.89 GB Max_CA 2 GB +[default0]:[2022-10-06 12:43:17,540] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.3 GB, percent = 7.2% +[default0]:[2022-10-06 12:43:17,540] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[default0]:[2022-10-06 12:43:17,540] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[default0]:[2022-10-06 12:43:17,540] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[default0]:[2022-10-06 12:43:17,540] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05, 2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default0]:[2022-10-06 12:43:17,541] [INFO] [config.py:987:print] DeepSpeedEngine configuration: +[default0]:[2022-10-06 12:43:17,541] [INFO] [config.py:991:print] activation_checkpointing_config { +[default0]: "partition_activations": false, +[default0]: "contiguous_memory_optimization": false, +[default0]: "cpu_checkpointing": false, +[default0]: "number_checkpoints": null, +[default0]: "synchronize_checkpoint_boundary": false, +[default0]: "profile": false +[default0]:} +[default0]:[2022-10-06 12:43:17,541] [INFO] [config.py:991:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[default0]:[2022-10-06 12:43:17,541] [INFO] [config.py:991:print] amp_enabled .................. False +[default0]:[2022-10-06 12:43:17,541] [INFO] [config.py:991:print] amp_params ................... False +[default0]:[2022-10-06 12:43:17,541] [INFO] [config.py:991:print] autotuning_config ............ { +[default0]: "enabled": false, +[default0]: "start_step": null, +[default0]: "end_step": null, +[default0]: "metric_path": null, +[default0]: "arg_mappings": null, +[default0]: "metric": "throughput", +[default0]: "model_info": null, +[default0]: "results_dir": null, +[default0]: "exps_dir": null, +[default0]: "overwrite": true, +[default0]: "fast": true, +[default0]: "start_profile_step": 3, +[default0]: "end_profile_step": 5, +[default0]: "tuner_type": "gridsearch", +[default0]: "tuner_early_stopping": 5, +[default0]: "tuner_num_trials": 50, +[default0]: "model_info_path": null, +[default0]: "mp_size": 1, +[default0]: "max_train_batch_size": null, +[default0]: "min_train_batch_size": 1, +[default0]: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +[default0]: "min_train_micro_batch_size_per_gpu": 1, +[default0]: "num_tuning_micro_batch_sizes": 3 +[default0]:} +[default0]:[2022-10-06 12:43:17,541] [INFO] [config.py:991:print] bfloat16_enabled ............. False +[default0]:[2022-10-06 12:43:17,541] [INFO] [config.py:991:print] checkpoint_tag_validation_enabled True +[default0]:[2022-10-06 12:43:17,541] [INFO] [config.py:991:print] checkpoint_tag_validation_fail False +[default0]:[2022-10-06 12:43:17,541] [INFO] [config.py:991:print] comms_config ................. +[default0]:[2022-10-06 12:43:17,541] [INFO] [config.py:991:print] communication_data_type ...... None +[default0]:[2022-10-06 12:43:17,541] [INFO] [config.py:991:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[default0]:[2022-10-06 12:43:17,541] [INFO] [config.py:991:print] curriculum_enabled ........... False +[default0]:[2022-10-06 12:43:17,541] [INFO] [config.py:991:print] curriculum_params ............ False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] dataloader_drop_last ......... False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] disable_allgather ............ False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] dump_state ................... False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] eigenvalue_enabled ........... False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] eigenvalue_gas_boundary_resolution 1 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] eigenvalue_layer_name ........ bert.encoder.layer +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] eigenvalue_layer_num ......... 0 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] eigenvalue_max_iter .......... 100 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] eigenvalue_stability ......... 1e-06 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] eigenvalue_tol ............... 0.01 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] eigenvalue_verbose ........... False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] elasticity_enabled ........... False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] flops_profiler_config ........ { +[default0]: "enabled": false, +[default0]: "profile_step": 1, +[default0]: "module_depth": -1, +[default0]: "top_modules": 1, +[default0]: "detailed": true, +[default0]: "output_file": null +[default0]:} +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] fp16_auto_cast ............... False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] fp16_enabled ................. True +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] fp16_master_weights_and_gradients False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] global_rank .................. 0 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] gradient_accumulation_steps .. 32 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] gradient_clipping ............ 1.0 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] gradient_predivide_factor .... 1.0 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] initial_dynamic_scale ........ 4096 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] load_universal_checkpoint .... False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] loss_scale ................... 0 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] memory_breakdown ............. False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] monitor_config ............... +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] nebula_config ................ { +[default0]: "enabled": false, +[default0]: "persistent_storage_path": null, +[default0]: "persistent_time_interval": 100, +[default0]: "num_of_version_in_retention": 2, +[default0]: "enable_nebula_load": true, +[default0]: "load_path": null +[default0]:} +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] optimizer_legacy_fusion ...... False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] optimizer_name ............... None +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] optimizer_params ............. None +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] pld_enabled .................. False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] pld_params ................... False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] prescale_gradients ........... False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] scheduler_name ............... None +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] scheduler_params ............. None +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] sparse_attention ............. None +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] sparse_gradients_enabled ..... False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] steps_per_print .............. 2000 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] train_batch_size ............. 1024 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] train_micro_batch_size_per_gpu 1 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] wall_clock_breakdown ......... False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] world_size ................... 32 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] zero_allow_untested_optimizer False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] zero_enabled ................. True +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:991:print] zero_optimization_stage ...... 1 +[default0]:[2022-10-06 12:43:17,542] [INFO] [config.py:976:print_user_config] json = { +[default0]: "train_micro_batch_size_per_gpu": 1, +[default0]: "train_batch_size": 1.024000e+03, +[default0]: "gradient_clipping": 1.0, +[default0]: "zero_optimization": { +[default0]: "stage": 1 +[default0]: }, +[default0]: "fp16": { +[default0]: "enabled": true, +[default0]: "loss_scale": 0, +[default0]: "loss_scale_window": 500, +[default0]: "hysteresis": 2, +[default0]: "min_loss_scale": 1, +[default0]: "initial_scale_power": 12 +[default0]: }, +[default0]: "steps_per_print": 2.000000e+03, +[default0]: "wall_clock_breakdown": false +[default0]:} +[default0]:Using /gpfs7kw/linkhome/rech/genhug01/unj46ad/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +[default0]:No modifications detected for re-loaded extension module utils, skipping build step... +[default0]:Loading extension module utils... +[default0]:Time to load utils op: 0.00042366981506347656 seconds +[default0]:[2022-10-06 12:43:17,543] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=32 micro_batch_size=1 +[default0]:[2022-10-06 12:43:17,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:17,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:43:17,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:43:17,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:17,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:43:17,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:17,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:17,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:17,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:43:17,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:17,679] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=31 [0, 31) STAGE_PARAMS=559214592 (559.215M) TOTAL_PARAMS=559214592 (559.215M) UNIQUE_PARAMS=559214592 (559.215M) +[default0]:[2022-10-06 12:43:17,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:43:17,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:43:17,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:17,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:43:17,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:43:17,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:17,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default4]:[2022-10-06 12:43:17,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default4]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:17,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default2]:[2022-10-06 12:43:17,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default2]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default5]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default5]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:17,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default1]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:17,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:43:17,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:43:17,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:17,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default3]:[2022-10-06 12:43:17,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default3]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:17,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default6]:[2022-10-06 12:43:17,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default6]:[2022-10-06 12:43:17,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:17,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:43:17,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt... +[default7]:[2022-10-06 12:43:17,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/mp_rank_00_model_states.pt. +[default7]:[2022-10-06 12:43:17,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:17,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:17,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:17,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:17,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:17,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:17,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:17,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:17,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:17,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:17,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:17,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:17,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:17,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:17,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:17,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:17,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:17,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:17,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:17,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:17,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:17,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:17,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:17,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:17,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:17,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:17,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:17,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:17,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:17,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:17,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:17,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:17,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:17,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:17,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:17,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:17,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:17,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:17,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:17,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:17,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:17,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:17,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:17,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:17,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:17,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:17,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:17,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:17,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:17,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:17,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:17,886] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:17,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:17,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:17,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:17,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:17,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:17,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:17,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:17,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:17,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:17,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:17,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:17,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:17,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:17,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:17,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:17,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:17,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:17,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:17,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:17,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:17,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:17,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:17,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:17,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,014] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:17,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:17,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:17,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,117] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,117] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,118] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,118] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,118] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,118] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,122] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_01-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,215] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,299] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,330] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_03-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_04-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_05-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_06-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_07-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_08-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_09-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_10-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,886] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,815] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_11-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_12-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:18,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:18,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_13-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:18,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:18,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:18,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:18,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:18,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:18,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:18,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:18,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:18,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:18,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:18,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:18,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:18,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:18,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_14-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,118] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,122] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,122] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,119] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_15-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,215] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_16-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,299] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,326] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,325] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,326] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:43:19,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:43:19,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:43:19,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:43:19,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_17-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,325] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_18-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,419] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_19-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default0]: > using checkpoint value 2e-05 for learning rate +[default0]: > using checkpoint value 0.0 for minimum learning rate +[default0]: > using checkpoint value 0 for warmup iterations +[default0]: > using checkpoint value 6348800 for total number of iterations +[default0]: > using checkpoint value constant for decay style +[default0]:[2022-10-06 12:43:19,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:43:19,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,419] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,419] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:43:19,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:43:19,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:43:19,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_20-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_21-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:43:19,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,416] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:43:19,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:43:19,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:43:19,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:43:19,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:43:19,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:43:19,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:43:19,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:43:19,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 12:43:19,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:43:19,553] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 0 +[default2]:[2022-10-06 12:43:19,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 12:43:19,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:43:19,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:43:19,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:43:19,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:43:19,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:43:19,517] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 10 +[default4]:[2022-10-06 12:43:19,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:43:19,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default2]:[2022-10-06 12:43:19,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default2]:[2022-10-06 12:43:19,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 12:43:19,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:43:19,516] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 5 +[default5]:[2022-10-06 12:43:19,583] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 5 +[default0]:[2022-10-06 12:43:19,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 12:43:19,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:43:19,562] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 11 +[default1]:[2022-10-06 12:43:19,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:43:19,552] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 9 +[default5]:[2022-10-06 12:43:19,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:43:19,574] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 13 +[default6]:[2022-10-06 12:43:19,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_22-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt... +[default3]:[2022-10-06 12:43:19,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:43:19,577] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 3 +[default6]:[2022-10-06 12:43:19,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:43:19,523] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 7 +[default7]:[2022-10-06 12:43:19,587] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 7 +[default0]:[2022-10-06 12:43:19,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:43:19,636] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 24 +[default3]:[2022-10-06 12:43:19,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:43:19,655] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 27 +[default5]:[2022-10-06 12:43:19,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:43:19,648] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 29 +[default7]:[2022-10-06 12:43:19,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:43:19,616] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 31 +[default7]:[2022-10-06 12:43:19,648] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 31 +[default1]:[2022-10-06 12:43:19,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:43:19,592] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 0 +[default0]: checkpoint version 3.0 +[default2]:[2022-10-06 12:43:19,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:43:19,630] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 18 +[default7]:[2022-10-06 12:43:19,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default7]:[2022-10-06 12:43:19,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 12:43:19,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 12:43:19,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default7]:[2022-10-06 12:43:19,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:43:19,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 12:43:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_23-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_24-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:43:19,668] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 22 +[default5]:[2022-10-06 12:43:19,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_25-model_00-model_states.pt. +[default3]:[2022-10-06 12:43:19,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 12:43:19,633] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 19 +[default5]:[2022-10-06 12:43:19,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 12:43:19,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:43:19,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default1]:[2022-10-06 12:43:19,627] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 17 +[default1]:[2022-10-06 12:43:19,679] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 17 +[default5]:[2022-10-06 12:43:19,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default5]:[2022-10-06 12:43:19,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default5]:[2022-10-06 12:43:19,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 12:43:19,587] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 10 +[default2]:[2022-10-06 12:43:19,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:43:19,672] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 2 +[default6]:[2022-10-06 12:43:19,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:43:19,594] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 14 +[default6]:[2022-10-06 12:43:19,633] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 14 +[default3]:[2022-10-06 12:43:19,601] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 11 +[default1]:[2022-10-06 12:43:19,595] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 9 +[default5]:[2022-10-06 12:43:19,618] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 13 +[default3]:[2022-10-06 12:43:19,610] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 3 +[default6]:[2022-10-06 12:43:19,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default6]:[2022-10-06 12:43:19,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default6]:[2022-10-06 12:43:19,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 12:43:19,671] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 24 +[default2]:[2022-10-06 12:43:19,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 12:43:19,738] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 26 +[default3]:[2022-10-06 12:43:19,694] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 27 +[default4]:[2022-10-06 12:43:19,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:43:19,763] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 28 +[default5]:[2022-10-06 12:43:19,686] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 29 +[default1]:[2022-10-06 12:43:19,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:43:19,724] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 25 +[default1]:[2022-10-06 12:43:19,758] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 25 +[default1]:[2022-10-06 12:43:19,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 12:43:19,735] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 1 +[default1]:[2022-10-06 12:43:19,767] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 1 +[default2]:[2022-10-06 12:43:19,695] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 18 +[default7]:[2022-10-06 12:43:19,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:43:19,726] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 23 +[default7]:[2022-10-06 12:43:19,760] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 23 +[default0]:[2022-10-06 12:43:19,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:43:19,729] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 16 +[default0]:[2022-10-06 12:43:19,762] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 16 +[default4]:[2022-10-06 12:43:19,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_26-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt... +[default4]:[2022-10-06 12:43:19,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/layer_28-model_00-model_states.pt. +[default4]:[2022-10-06 12:43:19,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 12:43:19,708] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 22 +[default3]:[2022-10-06 12:43:19,694] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 19 +[default4]:[2022-10-06 12:43:19,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:43:19,701] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 12 +[default4]:[2022-10-06 12:43:19,735] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 12 +[default4]:[2022-10-06 12:43:19,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:43:19,708] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 4 +[default4]:[2022-10-06 12:43:19,750] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 4 +[default0]:[2022-10-06 12:43:19,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 12:43:19,723] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 8 +[default0]:[2022-10-06 12:43:19,764] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 8 +[default7]:[2022-10-06 12:43:19,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 12:43:19,727] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 15 +[default7]:[2022-10-06 12:43:19,768] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 15 +[default2]:[2022-10-06 12:43:19,706] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 2 +[default6]:[2022-10-06 12:43:19,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:43:19,696] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 6 +[default6]:[2022-10-06 12:43:19,734] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 6 +[default2]:[2022-10-06 12:43:19,777] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 26 +[default4]:[2022-10-06 12:43:19,798] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 28 +[default6]:[2022-10-06 12:43:19,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 12:43:19,770] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 30 +[default6]:[2022-10-06 12:43:19,802] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 30 +[default7]:time (ms) | load-checkpoint: 2151.50 +[default0]: successfully loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq at iteration 10 +[default4]:[2022-10-06 12:43:19,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 12:43:19,793] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 20 +[default4]:[2022-10-06 12:43:19,833] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 20 +[default5]:[2022-10-06 12:43:19,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step10/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 12:43:19,785] [INFO] [engine.py:2833:_get_all_zero_checkpoint_state_dicts] successfully read 32 ZeRO state_dicts for rank 21 +[default5]:[2022-10-06 12:43:19,821] [INFO] [engine.py:2767:_load_zero_checkpoint] loading 32 zero partition checkpoints for rank 21 +[default0]:estimated model parameters: 0.559214592 +[default0]:estimated model parameters without embeddings: 0.302313472 +[default0]:[after model, optimizer, and learning rate scheduler are built] datetime: 2022-10-06 12:43:19 +[default0]:> building train, validation, and test datasets ... +[default0]: > datasets target sizes (minimum size): +[default0]: train: 6348800 +[default0]: validation: 512000 +[default0]: test: 10240 +[default0]:> building train, validation, and test datasets for T0 ... +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010511 seconds +[default0]: number of documents: 31495184 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 29920425) total of 29920425 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.019758 seconds +[default0]: number of documents: 31495184 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002412 seconds +[default0]: number of documents: 31495184 +[default0]:/gpfsssd/worksf/projects/rech/six/commun/code/tr13f-6B3-ml-t0/megdslossseqnew350m/Megatron-DeepSpeed/megatron/utils.py:365: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +[default0]: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_train_indexmap_2470317ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_train_indexmap_2470317ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.053 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005297 seconds +[default0]: number of documents: 5151349 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 4893782) total of 4893782 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005486 seconds +[default0]: number of documents: 5151349 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002663 seconds +[default0]: number of documents: 5151349 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_train_indexmap_501992ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_train_indexmap_501992ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.031 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013981 seconds +[default0]: number of documents: 3562772 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 3384633) total of 3384633 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.017845 seconds +[default0]: number of documents: 3562772 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004148 seconds +[default0]: number of documents: 3562772 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_train_indexmap_406376ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_train_indexmap_406376ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.033 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010356 seconds +[default0]: number of documents: 2707724 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2572338) total of 2572338 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013309 seconds +[default0]: number of documents: 2707724 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004424 seconds +[default0]: number of documents: 2707724 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_train_indexmap_373254ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_train_indexmap_373254ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.027 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004550 seconds +[default0]: number of documents: 5055942 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 4803145) total of 4803145 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.028534 seconds +[default0]: number of documents: 5055942 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005228 seconds +[default0]: number of documents: 5055942 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_train_indexmap_367732ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_train_indexmap_367732ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.027 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007270 seconds +[default0]: number of documents: 2148955 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2041507) total of 2041507 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.047247 seconds +[default0]: number of documents: 2148955 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005500 seconds +[default0]: number of documents: 2148955 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_train_indexmap_310091ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_train_indexmap_310091ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.022 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007343 seconds +[default0]: number of documents: 2627392 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 2496022) total of 2496022 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010529 seconds +[default0]: number of documents: 2627392 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002998 seconds +[default0]: number of documents: 2627392 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_train_indexmap_305386ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_train_indexmap_305386ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.027 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010496 seconds +[default0]: number of documents: 3560556 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 3382528) total of 3382528 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.022007 seconds +[default0]: number of documents: 3560556 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003237 seconds +[default0]: number of documents: 3560556 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_train_indexmap_304299ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_train_indexmap_304299ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.029 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.061038 seconds +[default0]: number of documents: 1543441 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1466269) total of 1466269 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012336 seconds +[default0]: number of documents: 1543441 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004833 seconds +[default0]: number of documents: 1543441 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_train_indexmap_291291ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_train_indexmap_291291ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.028 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009668 seconds +[default0]: number of documents: 1667306 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1583941) total of 1583941 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013481 seconds +[default0]: number of documents: 1667306 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004331 seconds +[default0]: number of documents: 1667306 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_train_indexmap_205616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_train_indexmap_205616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.017 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011662 seconds +[default0]: number of documents: 855756 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 812968) total of 812968 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.018021 seconds +[default0]: number of documents: 855756 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003254 seconds +[default0]: number of documents: 855756 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_train_indexmap_127177ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_train_indexmap_127177ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.008 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006933 seconds +[default0]: number of documents: 573364 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 544696) total of 544696 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.019674 seconds +[default0]: number of documents: 573364 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001513 seconds +[default0]: number of documents: 573364 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_train_indexmap_88369ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_train_indexmap_88369ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.036 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004625 seconds +[default0]: number of documents: 410633 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 390101) total of 390101 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005914 seconds +[default0]: number of documents: 410633 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001648 seconds +[default0]: number of documents: 410633 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_train_indexmap_61292ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_train_indexmap_61292ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.035 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004803 seconds +[default0]: number of documents: 428843 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 407401) total of 407401 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004400 seconds +[default0]: number of documents: 428843 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001968 seconds +[default0]: number of documents: 428843 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_train_indexmap_55208ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_train_indexmap_55208ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.047 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005506 seconds +[default0]: number of documents: 417269 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 396406) total of 396406 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008023 seconds +[default0]: number of documents: 417269 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001237 seconds +[default0]: number of documents: 417269 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_train_indexmap_44171ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_train_indexmap_44171ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.037 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.030560 seconds +[default0]: number of documents: 1114455 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 1058732) total of 1058732 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011635 seconds +[default0]: number of documents: 1114455 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003700 seconds +[default0]: number of documents: 1114455 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_train_indexmap_37186ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_train_indexmap_37186ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.011 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002764 seconds +[default0]: number of documents: 347499 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 330124) total of 330124 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004922 seconds +[default0]: number of documents: 347499 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001567 seconds +[default0]: number of documents: 347499 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_train_indexmap_37173ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_train_indexmap_37173ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004538 seconds +[default0]: number of documents: 339210 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 322250) total of 322250 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005298 seconds +[default0]: number of documents: 339210 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001169 seconds +[default0]: number of documents: 339210 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_train_indexmap_34691ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_train_indexmap_34691ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004455 seconds +[default0]: number of documents: 315754 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 299966) total of 299966 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005042 seconds +[default0]: number of documents: 315754 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001379 seconds +[default0]: number of documents: 315754 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_train_indexmap_26100ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_train_indexmap_26100ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.009 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.023793 seconds +[default0]: number of documents: 918416 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 872495) total of 872495 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005217 seconds +[default0]: number of documents: 918416 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004587 seconds +[default0]: number of documents: 918416 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_train_indexmap_23389ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_train_indexmap_23389ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.011 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009032 seconds +[default0]: number of documents: 950097 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 902592) total of 902592 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.044114 seconds +[default0]: number of documents: 950097 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003955 seconds +[default0]: number of documents: 950097 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_train_indexmap_21563ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_train_indexmap_21563ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.033 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011436 seconds +[default0]: number of documents: 915063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869310) total of 869310 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010425 seconds +[default0]: number of documents: 915063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004295 seconds +[default0]: number of documents: 915063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_train_indexmap_18042ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_train_indexmap_18042ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.012 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010728 seconds +[default0]: number of documents: 915061 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869308) total of 869308 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009814 seconds +[default0]: number of documents: 915061 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005014 seconds +[default0]: number of documents: 915061 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_train_indexmap_17484ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_train_indexmap_17484ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.009 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008181 seconds +[default0]: number of documents: 915058 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869305) total of 869305 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013087 seconds +[default0]: number of documents: 915058 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003071 seconds +[default0]: number of documents: 915058 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_train_indexmap_16885ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_train_indexmap_16885ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.011 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011383 seconds +[default0]: number of documents: 865056 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 821803) total of 821803 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014205 seconds +[default0]: number of documents: 865056 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004206 seconds +[default0]: number of documents: 865056 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_train_indexmap_16740ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_train_indexmap_16740ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.038 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011839 seconds +[default0]: number of documents: 915044 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869292) total of 869292 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009964 seconds +[default0]: number of documents: 915044 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002766 seconds +[default0]: number of documents: 915044 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_train_indexmap_16592ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_train_indexmap_16592ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.035 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008548 seconds +[default0]: number of documents: 915043 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869291) total of 869291 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011258 seconds +[default0]: number of documents: 915043 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003129 seconds +[default0]: number of documents: 915043 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_train_indexmap_16532ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_train_indexmap_16532ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.019 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.015516 seconds +[default0]: number of documents: 915021 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869270) total of 869270 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012449 seconds +[default0]: number of documents: 915021 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006034 seconds +[default0]: number of documents: 915021 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_train_indexmap_15642ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_train_indexmap_15642ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.043 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010088 seconds +[default0]: number of documents: 915054 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869301) total of 869301 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012544 seconds +[default0]: number of documents: 915054 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003671 seconds +[default0]: number of documents: 915054 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_train_indexmap_15616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_train_indexmap_15616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.009 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009594 seconds +[default0]: number of documents: 915051 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 869298) total of 869298 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011327 seconds +[default0]: number of documents: 915051 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004070 seconds +[default0]: number of documents: 915051 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_train_indexmap_15230ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_train_indexmap_15230ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.024 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002888 seconds +[default0]: number of documents: 318189 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 302280) total of 302280 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004670 seconds +[default0]: number of documents: 318189 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001218 seconds +[default0]: number of documents: 318189 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_train_indexmap_12795ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_train_indexmap_12795ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.021 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004618 seconds +[default0]: number of documents: 265864 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 252571) total of 252571 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004247 seconds +[default0]: number of documents: 265864 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001135 seconds +[default0]: number of documents: 265864 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_train_indexmap_11605ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_train_indexmap_11605ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001508 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004472 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000949 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_train_indexmap_10970ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_train_indexmap_10970ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005716 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.008276 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001034 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_train_indexmap_10706ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_train_indexmap_10706ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.039 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005668 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004681 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000991 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_train_indexmap_10360ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_train_indexmap_10360ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006053 seconds +[default0]: number of documents: 365060 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 346807) total of 346807 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005325 seconds +[default0]: number of documents: 365060 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001366 seconds +[default0]: number of documents: 365060 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_train_indexmap_8374ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_train_indexmap_8374ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005626 seconds +[default0]: number of documents: 365063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 346810) total of 346810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005591 seconds +[default0]: number of documents: 365063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001235 seconds +[default0]: number of documents: 365063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_train_indexmap_8126ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_train_indexmap_8126ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.029 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004914 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004812 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001165 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_train_indexmap_7693ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_train_indexmap_7693ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004750 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251921) total of 251921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007837 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001035 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_train_indexmap_7627ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_train_indexmap_7627ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.024 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005447 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004802 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000950 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_train_indexmap_7564ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_train_indexmap_7564ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005542 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251810) total of 251810 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004436 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000943 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_train_indexmap_7497ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_train_indexmap_7497ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005453 seconds +[default0]: number of documents: 271191 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 257631) total of 257631 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005251 seconds +[default0]: number of documents: 271191 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001319 seconds +[default0]: number of documents: 271191 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_train_indexmap_7334ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_train_indexmap_7334ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004927 seconds +[default0]: number of documents: 269973 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 256474) total of 256474 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.032118 seconds +[default0]: number of documents: 269973 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000786 seconds +[default0]: number of documents: 269973 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_train_indexmap_7168ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_train_indexmap_7168ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004624 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251817) total of 251817 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005421 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001043 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_train_indexmap_7167ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_train_indexmap_7167ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.040 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004816 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251921) total of 251921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004576 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001226 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_train_indexmap_7098ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_train_indexmap_7098ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.021 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005370 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: train: +[default0]: document indices in [0, 251817) total of 251817 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005180 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000995 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_train_indexmap_7047ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_train_indexmap_7047ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.011 seconds +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.387164, achieved: 0.387164 +[default0]: dataset 1, input: 0.0786754, achieved: 0.0786753 +[default0]: dataset 2, input: 0.0636898, achieved: 0.0636898 +[default0]: dataset 3, input: 0.0584986, achieved: 0.0584986 +[default0]: dataset 4, input: 0.0576332, achieved: 0.0576332 +[default0]: dataset 5, input: 0.0485994, achieved: 0.0485994 +[default0]: dataset 6, input: 0.0478619, achieved: 0.0478619 +[default0]: dataset 7, input: 0.0476917, achieved: 0.0476917 +[default0]: dataset 8, input: 0.045653, achieved: 0.045653 +[default0]: dataset 9, input: 0.0322254, achieved: 0.0322254 +[default0]: dataset 10, input: 0.0199319, achieved: 0.0199319 +[default0]: dataset 11, input: 0.0138497, achieved: 0.0138497 +[default0]: dataset 12, input: 0.00960604, achieved: 0.00960602 +[default0]: dataset 13, input: 0.00865243, achieved: 0.00865244 +[default0]: dataset 14, input: 0.00692261, achieved: 0.00692258 +[default0]: dataset 15, input: 0.00582803, achieved: 0.00582806 +[default0]: dataset 16, input: 0.00582586, achieved: 0.00582586 +[default0]: dataset 17, input: 0.00543684, achieved: 0.00543682 +[default0]: dataset 18, input: 0.00409056, achieved: 0.00409057 +[default0]: dataset 19, input: 0.00366562, achieved: 0.00366564 +[default0]: dataset 20, input: 0.00337934, achieved: 0.00337937 +[default0]: dataset 21, input: 0.00282756, achieved: 0.00282753 +[default0]: dataset 22, input: 0.00274012, achieved: 0.00274012 +[default0]: dataset 23, input: 0.00264619, achieved: 0.00264622 +[default0]: dataset 24, input: 0.00262358, achieved: 0.0026236 +[default0]: dataset 25, input: 0.0026003, achieved: 0.00260032 +[default0]: dataset 26, input: 0.00259099, achieved: 0.00259097 +[default0]: dataset 27, input: 0.00245151, achieved: 0.00245155 +[default0]: dataset 28, input: 0.00244735, achieved: 0.00244736 +[default0]: dataset 29, input: 0.00238684, achieved: 0.00238686 +[default0]: dataset 30, input: 0.0020053, achieved: 0.00200525 +[default0]: dataset 31, input: 0.00181876, achieved: 0.00181879 +[default0]: dataset 32, input: 0.00171918, achieved: 0.00171917 +[default0]: dataset 33, input: 0.00167779, achieved: 0.00167776 +[default0]: dataset 34, input: 0.00162359, achieved: 0.00162355 +[default0]: dataset 35, input: 0.00131237, achieved: 0.00131238 +[default0]: dataset 36, input: 0.00127347, achieved: 0.00127344 +[default0]: dataset 37, input: 0.00120564, achieved: 0.00120569 +[default0]: dataset 38, input: 0.00119533, achieved: 0.00119529 +[default0]: dataset 39, input: 0.00118536, achieved: 0.00118536 +[default0]: dataset 40, input: 0.00117488, achieved: 0.00117487 +[default0]: dataset 41, input: 0.00114928, achieved: 0.00114929 +[default0]: dataset 42, input: 0.00112334, achieved: 0.00112334 +[default0]: dataset 43, input: 0.00112318, achieved: 0.00112315 +[default0]: dataset 44, input: 0.00111237, achieved: 0.00111236 +[default0]: dataset 45, input: 0.00110439, achieved: 0.00110444 +[default0]:> elapsed time for building blendable dataset indices: 0.64 (sec) +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.045330 seconds +[default0]: number of documents: 15234080 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [14472376, 15234080) total of 761704 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_17016ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_17016ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ar/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ar_text_document_validation_pretraining_indexmap_17016ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.054 seconds +[default0]: total number of samples: 221750 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.067874 seconds +[default0]: number of documents: 6142390 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [5835270, 6142390) total of 307120 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_5785ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_5785ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/ca/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_ca_text_document_validation_pretraining_indexmap_5785ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.069 seconds +[default0]: total number of samples: 136143 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.122961 seconds +[default0]: number of documents: 26176998 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [24868148, 26176998) total of 1308850 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_67033ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_67033ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/code/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_code_text_document_validation_pretraining_indexmap_67033ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.057 seconds +[default0]: total number of samples: 432311 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.119277 seconds +[default0]: number of documents: 20844665 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [19802432, 20844665) total of 1042233 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_114084ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_114084ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/en/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_en_text_document_validation_pretraining_indexmap_114084ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.054 seconds +[default0]: total number of samples: 521545 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.049005 seconds +[default0]: number of documents: 67005817 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [63655526, 67005817) total of 3350291 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_54893ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_54893ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/es/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_es_text_document_validation_pretraining_indexmap_54893ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.065 seconds +[default0]: total number of samples: 1740321 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.098203 seconds +[default0]: number of documents: 5149795 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4892305, 5149795) total of 257490 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_803ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_803ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/eu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_eu_text_document_validation_pretraining_indexmap_803ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: total number of samples: 26370 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.051149 seconds +[default0]: number of documents: 58847091 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [55904736, 58847091) total of 2942355 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_67171ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_67171ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/fr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_fr_text_document_validation_pretraining_indexmap_67171ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.095 seconds +[default0]: total number of samples: 1458654 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.059757 seconds +[default0]: number of documents: 12514253 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [11888540, 12514253) total of 625713 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_5618ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_5618ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/id/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_id_text_document_validation_pretraining_indexmap_5618ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.025 seconds +[default0]: total number of samples: 134071 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.016239 seconds +[default0]: number of documents: 180608 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [171578, 180608) total of 9030 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_57ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_57ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-as/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-as_text_document_validation_pretraining_indexmap_57ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: total number of samples: 2501 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.055751 seconds +[default0]: number of documents: 12303134 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [11687977, 12303134) total of 615157 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_2827ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_2827ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-bn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-bn_text_document_validation_pretraining_indexmap_2827ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.065 seconds +[default0]: total number of samples: 157244 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.043344 seconds +[default0]: number of documents: 2033057 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [1931404, 2033057) total of 101653 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_207ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_207ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-gu/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-gu_text_document_validation_pretraining_indexmap_207ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: total number of samples: 20517 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.131037 seconds +[default0]: number of documents: 26793553 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [25453875, 26793553) total of 1339678 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_3844ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_3844ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-hi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-hi_text_document_validation_pretraining_indexmap_3844ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.051 seconds +[default0]: total number of samples: 101502 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.064804 seconds +[default0]: number of documents: 3155990 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2998190, 3155990) total of 157800 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_319ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_319ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-kn/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-kn_text_document_validation_pretraining_indexmap_319ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.043 seconds +[default0]: total number of samples: 44182 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.076890 seconds +[default0]: number of documents: 6692522 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [6357896, 6692522) total of 334626 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_532ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_532ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ml/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ml_text_document_validation_pretraining_indexmap_532ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: total number of samples: 47613 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.047351 seconds +[default0]: number of documents: 3017261 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2866398, 3017261) total of 150863 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_258ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_258ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-mr/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-mr_text_document_validation_pretraining_indexmap_258ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.029 seconds +[default0]: total number of samples: 29298 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.048807 seconds +[default0]: number of documents: 3648041 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [3465639, 3648041) total of 182402 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_344ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_344ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ne/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ne_text_document_validation_pretraining_indexmap_344ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: total number of samples: 5659 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.085206 seconds +[default0]: number of documents: 4327282 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4110918, 4327282) total of 216364 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_185ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_185ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-or/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-or_text_document_validation_pretraining_indexmap_185ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.007 seconds +[default0]: total number of samples: 12423 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.076406 seconds +[default0]: number of documents: 2698896 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2563951, 2698896) total of 134945 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_262ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_262ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-pa/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-pa_text_document_validation_pretraining_indexmap_262ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.026 seconds +[default0]: total number of samples: 19133 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.065269 seconds +[default0]: number of documents: 12767593 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [12129213, 12767593) total of 638380 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_1088ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_1088ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ta/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ta_text_document_validation_pretraining_indexmap_1088ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.039 seconds +[default0]: total number of samples: 87928 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.053984 seconds +[default0]: number of documents: 4342323 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [4125207, 4342323) total of 217116 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_470ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_470ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-te/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-te_text_document_validation_pretraining_indexmap_470ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.021 seconds +[default0]: total number of samples: 69780 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.053081 seconds +[default0]: number of documents: 3022722 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [2871586, 3022722) total of 151136 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_641ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_641ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/indic-ur/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_indic-ur_text_document_validation_pretraining_indexmap_641ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.006 seconds +[default0]: total number of samples: 22532 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.038760 seconds +[default0]: number of documents: 1162568 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [1104440, 1162568) total of 58128 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_163ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_163ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/nigercongo-all/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_nigercongo-all_text_document_validation_pretraining_indexmap_163ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: total number of samples: 1608 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.049445 seconds +[default0]: number of documents: 55294645 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [52529913, 55294645) total of 2764732 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_41871ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_41871ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-en/meg_ds_bigscience_tokenizer_text_document_validation_pretraining_indexmap_41871ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.037 seconds +[default0]: total number of samples: 690621 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.096472 seconds +[default0]: number of documents: 44855616 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [42612835, 44855616) total of 2242781 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_28453ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_28453ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/oscar-zh/meg_ds_bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_text_document_validation_pretraining_indexmap_28453ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.036 seconds +[default0]: total number of samples: 468689 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.022795 seconds +[default0]: number of documents: 31969891 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [30371396, 31969891) total of 1598495 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_25493ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_25493ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/pt/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_pt_text_document_validation_pretraining_indexmap_25493ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.075 seconds +[default0]: total number of samples: 497625 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.173927 seconds +[default0]: number of documents: 34110375 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [32404856, 34110375) total of 1705519 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_12667ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_12667ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/vi/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_vi_text_document_validation_pretraining_indexmap_12667ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.070 seconds +[default0]: total number of samples: 125120 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.046036 seconds +[default0]: number of documents: 43761623 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [41573542, 43761623) total of 2188081 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_62220ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_62220ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zhs/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zhs_text_document_validation_pretraining_indexmap_62220ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.135 seconds +[default0]: total number of samples: 1010592 +[default0]: total number of epochs: 1 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.034977 seconds +[default0]: number of documents: 197602 +[default0]: > dataset split: +[default0]: validation_pretraining: +[default0]: document indices in [187722, 197602) total of 9880 documents +[default0]: > loading doc-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_267ns_2048sl_42s_doc_idx.npy +[default0]: > loading sample-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_267ns_2048sl_42s_sample_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfsdsstore/projects/rech/six/commun/merged-meg-ds_v3_pii/zht/bigscience-catalogue-data-dev_byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles_zht_text_document_validation_pretraining_indexmap_267ns_2048sl_42s_shuffle_idx.npy +[default0]: loaded indexed file in 0.008 seconds +[default0]: total number of samples: 4451 +[default0]: total number of epochs: 1 +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.0330676, achieved: 0.0330676 +[default0]: dataset 1, input: 0.0112421, achieved: 0.0112421 +[default0]: dataset 2, input: 0.130272, achieved: 0.130272 +[default0]: dataset 3, input: 0.221712, achieved: 0.221712 +[default0]: dataset 4, input: 0.106678, achieved: 0.106678 +[default0]: dataset 5, input: 0.00155951, achieved: 0.00155955 +[default0]: dataset 6, input: 0.13054, achieved: 0.13054 +[default0]: dataset 7, input: 0.010918, achieved: 0.0109181 +[default0]: dataset 8, input: 0.000110214, achieved: 0.000110257 +[default0]: dataset 9, input: 0.00549238, achieved: 0.00549235 +[default0]: dataset 10, input: 0.000402122, achieved: 0.000402094 +[default0]: dataset 11, input: 0.00747007, achieved: 0.00747007 +[default0]: dataset 12, input: 0.000619047, achieved: 0.000619024 +[default0]: dataset 13, input: 0.00103353, achieved: 0.0010336 +[default0]: dataset 14, input: 0.000501201, achieved: 0.000501226 +[default0]: dataset 15, input: 0.000667277, achieved: 0.000667231 +[default0]: dataset 16, input: 0.000359281, achieved: 0.000359326 +[default0]: dataset 17, input: 0.000508443, achieved: 0.000508519 +[default0]: dataset 18, input: 0.00211373, achieved: 0.0021138 +[default0]: dataset 19, input: 0.000912995, achieved: 0.000912961 +[default0]: dataset 20, input: 0.00124543, achieved: 0.00124546 +[default0]: dataset 21, input: 0.000315887, achieved: 0.00031594 +[default0]: dataset 22, input: 0.0813721, achieved: 0.0813721 +[default0]: dataset 23, input: 0.0552939, achieved: 0.0552939 +[default0]: dataset 24, input: 0.0495415, achieved: 0.0495414 +[default0]: dataset 25, input: 0.0246164, achieved: 0.0246163 +[default0]: dataset 26, input: 0.120917, achieved: 0.120917 +[default0]: dataset 27, input: 0.000517703, achieved: 0.000517666 +[default0]:> elapsed time for building blendable dataset indices: 0.32 (sec) +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007862 seconds +[default0]: number of documents: 31495184 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [29920425, 31495184) total of 1574759 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014838 seconds +[default0]: number of documents: 31495184 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002585 seconds +[default0]: number of documents: 31495184 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_validation_indexmap_199220ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_en_validation_indexmap_199220ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.046 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.017033 seconds +[default0]: number of documents: 5151349 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [4893782, 5151349) total of 257567 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005825 seconds +[default0]: number of documents: 5151349 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002857 seconds +[default0]: number of documents: 5151349 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_validation_indexmap_40484ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_es_validation_indexmap_40484ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.048 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013896 seconds +[default0]: number of documents: 3562772 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [3384633, 3562772) total of 178139 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004376 seconds +[default0]: number of documents: 3562772 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002782 seconds +[default0]: number of documents: 3562772 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_validation_indexmap_32773ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pt_validation_indexmap_32773ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.025 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014244 seconds +[default0]: number of documents: 2707724 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2572338, 2707724) total of 135386 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004422 seconds +[default0]: number of documents: 2707724 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003040 seconds +[default0]: number of documents: 2707724 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_validation_indexmap_30102ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_code_validation_indexmap_30102ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.025 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014595 seconds +[default0]: number of documents: 5055942 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [4803145, 5055942) total of 252797 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004242 seconds +[default0]: number of documents: 5055942 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002342 seconds +[default0]: number of documents: 5055942 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_validation_indexmap_29656ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fr_validation_indexmap_29656ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.021 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.017663 seconds +[default0]: number of documents: 2148955 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2041507, 2148955) total of 107448 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004669 seconds +[default0]: number of documents: 2148955 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002481 seconds +[default0]: number of documents: 2148955 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_validation_indexmap_25008ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ar_validation_indexmap_25008ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.026 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.019820 seconds +[default0]: number of documents: 2627392 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [2496022, 2627392) total of 131370 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004694 seconds +[default0]: number of documents: 2627392 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003866 seconds +[default0]: number of documents: 2627392 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_validation_indexmap_24628ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_id_validation_indexmap_24628ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.010941 seconds +[default0]: number of documents: 3560556 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [3382528, 3560556) total of 178028 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013538 seconds +[default0]: number of documents: 3560556 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002800 seconds +[default0]: number of documents: 3560556 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_validation_indexmap_24541ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zh_validation_indexmap_24541ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.025 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006451 seconds +[default0]: number of documents: 1543441 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1466269, 1543441) total of 77172 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014181 seconds +[default0]: number of documents: 1543441 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002769 seconds +[default0]: number of documents: 1543441 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_validation_indexmap_23492ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_hi_validation_indexmap_23492ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007392 seconds +[default0]: number of documents: 1667306 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1583941, 1667306) total of 83365 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013676 seconds +[default0]: number of documents: 1667306 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002989 seconds +[default0]: number of documents: 1667306 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_validation_indexmap_16582ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_vi_validation_indexmap_16582ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.016 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.020908 seconds +[default0]: number of documents: 855756 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [812968, 855756) total of 42788 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004651 seconds +[default0]: number of documents: 855756 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003344 seconds +[default0]: number of documents: 855756 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_validation_indexmap_10257ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ur_validation_indexmap_10257ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012527 seconds +[default0]: number of documents: 573364 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [544696, 573364) total of 28668 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003581 seconds +[default0]: number of documents: 573364 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001793 seconds +[default0]: number of documents: 573364 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_validation_indexmap_7127ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_te_validation_indexmap_7127ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.020 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.007560 seconds +[default0]: number of documents: 410633 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [390101, 410633) total of 20532 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002428 seconds +[default0]: number of documents: 410633 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001210 seconds +[default0]: number of documents: 410633 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_validation_indexmap_4943ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ta_validation_indexmap_4943ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005252 seconds +[default0]: number of documents: 428843 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [407401, 428843) total of 21442 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002107 seconds +[default0]: number of documents: 428843 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001056 seconds +[default0]: number of documents: 428843 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_validation_indexmap_4453ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bn_validation_indexmap_4453ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.015 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006285 seconds +[default0]: number of documents: 417269 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [396406, 417269) total of 20863 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002240 seconds +[default0]: number of documents: 417269 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001209 seconds +[default0]: number of documents: 417269 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_validation_indexmap_3563ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_mr_validation_indexmap_3563ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.028 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011971 seconds +[default0]: number of documents: 1114455 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [1058732, 1114455) total of 55723 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004488 seconds +[default0]: number of documents: 1114455 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002498 seconds +[default0]: number of documents: 1114455 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_validation_indexmap_2999ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sw_validation_indexmap_2999ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005105 seconds +[default0]: number of documents: 347499 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [330124, 347499) total of 17375 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002004 seconds +[default0]: number of documents: 347499 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001144 seconds +[default0]: number of documents: 347499 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_validation_indexmap_2998ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_gu_validation_indexmap_2998ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002016 seconds +[default0]: number of documents: 339210 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [322250, 339210) total of 16960 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003681 seconds +[default0]: number of documents: 339210 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000776 seconds +[default0]: number of documents: 339210 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_validation_indexmap_2798ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_pa_validation_indexmap_2798ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.019 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001800 seconds +[default0]: number of documents: 315754 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [299966, 315754) total of 15788 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003505 seconds +[default0]: number of documents: 315754 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000814 seconds +[default0]: number of documents: 315754 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_validation_indexmap_2105ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ne_validation_indexmap_2105ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.009813 seconds +[default0]: number of documents: 918416 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [872495, 918416) total of 45921 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003869 seconds +[default0]: number of documents: 918416 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002310 seconds +[default0]: number of documents: 918416 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_validation_indexmap_1887ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_yo_validation_indexmap_1887ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.015002 seconds +[default0]: number of documents: 950097 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [902592, 950097) total of 47505 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014270 seconds +[default0]: number of documents: 950097 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002545 seconds +[default0]: number of documents: 950097 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_validation_indexmap_1739ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ig_validation_indexmap_1739ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.015404 seconds +[default0]: number of documents: 915063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869310, 915063) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003903 seconds +[default0]: number of documents: 915063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002434 seconds +[default0]: number of documents: 915063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_validation_indexmap_1455ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ny_validation_indexmap_1455ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.011034 seconds +[default0]: number of documents: 915061 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869308, 915061) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014700 seconds +[default0]: number of documents: 915061 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003061 seconds +[default0]: number of documents: 915061 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_validation_indexmap_1410ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_zu_validation_indexmap_1410ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014943 seconds +[default0]: number of documents: 915058 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869305, 915058) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005095 seconds +[default0]: number of documents: 915058 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002702 seconds +[default0]: number of documents: 915058 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_validation_indexmap_1362ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_xh_validation_indexmap_1362ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014050 seconds +[default0]: number of documents: 865056 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [821803, 865056) total of 43253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004863 seconds +[default0]: number of documents: 865056 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002659 seconds +[default0]: number of documents: 865056 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_validation_indexmap_1350ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_sn_validation_indexmap_1350ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005562 seconds +[default0]: number of documents: 915044 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869292, 915044) total of 45752 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013397 seconds +[default0]: number of documents: 915044 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002818 seconds +[default0]: number of documents: 915044 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_validation_indexmap_1339ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ts_validation_indexmap_1339ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.019 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.014573 seconds +[default0]: number of documents: 915043 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869291, 915043) total of 45752 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003745 seconds +[default0]: number of documents: 915043 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002518 seconds +[default0]: number of documents: 915043 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_validation_indexmap_1334ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rw_validation_indexmap_1334ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.006183 seconds +[default0]: number of documents: 915021 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869270, 915021) total of 45751 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013246 seconds +[default0]: number of documents: 915021 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002835 seconds +[default0]: number of documents: 915021 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_validation_indexmap_1262ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_lg_validation_indexmap_1262ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.018 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.012184 seconds +[default0]: number of documents: 915054 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869301, 915054) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.013995 seconds +[default0]: number of documents: 915054 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002975 seconds +[default0]: number of documents: 915054 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_validation_indexmap_1260ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tn_validation_indexmap_1260ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.019459 seconds +[default0]: number of documents: 915051 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [869298, 915051) total of 45753 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004261 seconds +[default0]: number of documents: 915051 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002596 seconds +[default0]: number of documents: 915051 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_validation_indexmap_1229ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_nso_validation_indexmap_1229ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002209 seconds +[default0]: number of documents: 318189 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [302280, 318189) total of 15909 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.004376 seconds +[default0]: number of documents: 318189 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001087 seconds +[default0]: number of documents: 318189 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_validation_indexmap_1032ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_rn_validation_indexmap_1032ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003338 seconds +[default0]: number of documents: 265864 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [252571, 265864) total of 13293 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003775 seconds +[default0]: number of documents: 265864 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000763 seconds +[default0]: number of documents: 265864 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_validation_indexmap_936ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ml_validation_indexmap_936ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.005602 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002800 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000786 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_validation_indexmap_885ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_kn_validation_indexmap_885ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003366 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002466 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000764 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_validation_indexmap_864ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_or_validation_indexmap_864ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001990 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002800 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000847 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_validation_indexmap_836ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_as_validation_indexmap_836ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002658 seconds +[default0]: number of documents: 365060 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [346807, 365060) total of 18253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002963 seconds +[default0]: number of documents: 365060 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001124 seconds +[default0]: number of documents: 365060 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_validation_indexmap_676ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ln_validation_indexmap_676ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002384 seconds +[default0]: number of documents: 365063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [346810, 365063) total of 18253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003307 seconds +[default0]: number of documents: 365063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000893 seconds +[default0]: number of documents: 365063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_validation_indexmap_656ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_wo_validation_indexmap_656ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.002 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003501 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001460 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000764 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_validation_indexmap_621ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tum_validation_indexmap_621ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002012 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251921, 265180) total of 13259 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001251 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000709 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_validation_indexmap_616ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ki_validation_indexmap_616ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002053 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001226 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000677 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_validation_indexmap_610ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_st_validation_indexmap_610ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003432 seconds +[default0]: number of documents: 265063 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251810, 265063) total of 13253 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002891 seconds +[default0]: number of documents: 265063 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000783 seconds +[default0]: number of documents: 265063 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_validation_indexmap_605ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_fon_validation_indexmap_605ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.002145 seconds +[default0]: number of documents: 271191 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [257631, 271191) total of 13560 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001481 seconds +[default0]: number of documents: 271191 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000735 seconds +[default0]: number of documents: 271191 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_validation_indexmap_592ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ca_validation_indexmap_592ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003479 seconds +[default0]: number of documents: 269973 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [256474, 269973) total of 13499 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001277 seconds +[default0]: number of documents: 269973 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000775 seconds +[default0]: number of documents: 269973 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_validation_indexmap_579ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_eu_validation_indexmap_579ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001987 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251817, 265071) total of 13254 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001253 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000768 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_validation_indexmap_578ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_ak_validation_indexmap_578ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.005 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003602 seconds +[default0]: number of documents: 265180 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251921, 265180) total of 13259 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001339 seconds +[default0]: number of documents: 265180 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000660 seconds +[default0]: number of documents: 265180 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_validation_indexmap_573ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_bm_validation_indexmap_573ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.004 seconds +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.003192 seconds +[default0]: number of documents: 265071 +[default0]: > dataset split: +[default0]: validation: +[default0]: document indices in [251817, 265071) total of 13254 documents +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.001391 seconds +[default0]: number of documents: 265071 +[default0]: > building dataset index ... +[default0]: reading sizes... +[default0]: reading pointers... +[default0]: reading document index... +[default0]: creating numpy buffer of mmap... +[default0]: creating memory view of numpy buffer... +[default0]: > finished creating indexed dataset in 0.000776 seconds +[default0]: number of documents: 265071 +[default0]: > loading doc-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_validation_indexmap_569ns_42s_decoder_packed_batch_idx.npy +[default0]: > loading shuffle-idx mapping from /gpfswork/rech/six/commun/bigscience-training/xp3cappedmixednewcodelong/xp3_tw_validation_indexmap_569ns_42s_decoder_packed_shuffle_idx.npy +[default0]: loaded indexed file in 0.003 seconds +[default0]:> building indices for blendable datasets ... +[default0]: > sample ratios: +[default0]: dataset 0, input: 0.387164, achieved: 0.387163 +[default0]: dataset 1, input: 0.0786754, achieved: 0.0786745 +[default0]: dataset 2, input: 0.0636898, achieved: 0.0636904 +[default0]: dataset 3, input: 0.0584986, achieved: 0.0584976 +[default0]: dataset 4, input: 0.0576332, achieved: 0.0576328 +[default0]: dataset 5, input: 0.0485994, achieved: 0.0485991 +[default0]: dataset 6, input: 0.0478619, achieved: 0.0478619 +[default0]: dataset 7, input: 0.0476917, achieved: 0.0476916 +[default0]: dataset 8, input: 0.045653, achieved: 0.0456537 +[default0]: dataset 9, input: 0.0322254, achieved: 0.0322257 +[default0]: dataset 10, input: 0.0199319, achieved: 0.0199317 +[default0]: dataset 11, input: 0.0138497, achieved: 0.0138502 +[default0]: dataset 12, input: 0.00960604, achieved: 0.00960574 +[default0]: dataset 13, input: 0.00865243, achieved: 0.00865232 +[default0]: dataset 14, input: 0.00692261, achieved: 0.00692254 +[default0]: dataset 15, input: 0.00582803, achieved: 0.0058278 +[default0]: dataset 16, input: 0.00582586, achieved: 0.0058261 +[default0]: dataset 17, input: 0.00543684, achieved: 0.00543622 +[default0]: dataset 18, input: 0.00409056, achieved: 0.00409121 +[default0]: dataset 19, input: 0.00366562, achieved: 0.00366557 +[default0]: dataset 20, input: 0.00337934, achieved: 0.00337955 +[default0]: dataset 21, input: 0.00282756, achieved: 0.00282792 +[default0]: dataset 22, input: 0.00274012, achieved: 0.00273939 +[default0]: dataset 23, input: 0.00264619, achieved: 0.00264575 +[default0]: dataset 24, input: 0.00262358, achieved: 0.00262362 +[default0]: dataset 25, input: 0.0026003, achieved: 0.00259978 +[default0]: dataset 26, input: 0.00259099, achieved: 0.00259127 +[default0]: dataset 27, input: 0.00245151, achieved: 0.00245166 +[default0]: dataset 28, input: 0.00244735, achieved: 0.00244826 +[default0]: dataset 29, input: 0.00238684, achieved: 0.00238696 +[default0]: dataset 30, input: 0.0020053, achieved: 0.00200559 +[default0]: dataset 31, input: 0.00181876, achieved: 0.00181831 +[default0]: dataset 32, input: 0.00171918, achieved: 0.00171957 +[default0]: dataset 33, input: 0.00167779, achieved: 0.00167871 +[default0]: dataset 34, input: 0.00162359, achieved: 0.00162423 +[default0]: dataset 35, input: 0.00131237, achieved: 0.00131266 +[default0]: dataset 36, input: 0.00127347, achieved: 0.0012735 +[default0]: dataset 37, input: 0.00120564, achieved: 0.0012054 +[default0]: dataset 38, input: 0.00119533, achieved: 0.00119518 +[default0]: dataset 39, input: 0.00118536, achieved: 0.00118497 +[default0]: dataset 40, input: 0.00117488, achieved: 0.00117475 +[default0]: dataset 41, input: 0.00114928, achieved: 0.00114922 +[default0]: dataset 42, input: 0.00112334, achieved: 0.00112368 +[default0]: dataset 43, input: 0.00112318, achieved: 0.00112368 +[default0]: dataset 44, input: 0.00111237, achieved: 0.00111176 +[default0]: dataset 45, input: 0.00110439, achieved: 0.00110495 +[default0]:> elapsed time for building blendable dataset indices: 0.03 (sec) +[default0]:> finished creating T0 datasets ... +[default0]:[after dataloaders are built] datetime: 2022-10-06 12:43:32 +[default0]:done with setup ... +[default0]:training ... +[default0]:Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +[default0]:[000-000] 0.5592B / 0.3023B +[default0]:[before the start of training step] datetime: 2022-10-06 12:43:32 +[default0]:[2022-10-06 12:43:32,590] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information +[default0]:[2022-10-06 12:43:32,590] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False +[default0]:[2022-10-06 12:43:32,590] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 24 total layers +[default0]:[2022-10-06 12:43:32,590] [INFO] [checkpointing.py:554:forward] ----Synchronization False +[default0]:[2022-10-06 12:43:32,590] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False +[default7]:time (ms) | model-and-optimizer-setup: 8563.87 | train/valid/test-data-iterators-setup: 12518.47 +[default0]:[Rank 0] (after 11 iterations) memory (MB) | allocated: 5196.2685546875 | max allocated: 9317.00927734375 | reserved: 10698.0 | max reserved: 10698.0 +[default7]: iteration 11/ 6200 | consumed samples: 11264 | consumed tokens: 23068672 | elapsed time per iteration (s): 8.83 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 3.188134E+00 | loss scale: 512.0 | grad norm: 22.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 116.001 | TFLOPs: 35.36 | +[default7]: iteration 12/ 6200 | consumed samples: 12288 | consumed tokens: 25165824 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 3.105157E+00 | loss scale: 512.0 | grad norm: 41.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.799 | TFLOPs: 42.00 | +[default7]: iteration 13/ 6200 | consumed samples: 13312 | consumed tokens: 27262976 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 3.058267E+00 | loss scale: 512.0 | grad norm: 20.798 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.752 | TFLOPs: 41.99 | +[default7]: iteration 14/ 6200 | consumed samples: 14336 | consumed tokens: 29360128 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 3.038145E+00 | loss scale: 512.0 | grad norm: 24.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.410 | TFLOPs: 41.88 | +[default7]: iteration 15/ 6200 | consumed samples: 15360 | consumed tokens: 31457280 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 3.001483E+00 | loss scale: 512.0 | grad norm: 31.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.597 | TFLOPs: 41.94 | +[default7]: iteration 16/ 6200 | consumed samples: 16384 | consumed tokens: 33554432 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.960769E+00 | loss scale: 512.0 | grad norm: 23.905 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.182 | TFLOPs: 41.81 | +[default7]: iteration 17/ 6200 | consumed samples: 17408 | consumed tokens: 35651584 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.906392E+00 | loss scale: 512.0 | grad norm: 10.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.705 | TFLOPs: 41.97 | +[default7]: iteration 18/ 6200 | consumed samples: 18432 | consumed tokens: 37748736 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.878150E+00 | loss scale: 512.0 | grad norm: 18.906 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.799 | TFLOPs: 42.00 | +[default7]: iteration 19/ 6200 | consumed samples: 19456 | consumed tokens: 39845888 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.853431E+00 | loss scale: 512.0 | grad norm: 15.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.746 | TFLOPs: 41.99 | +[default7]: iteration 20/ 6200 | consumed samples: 20480 | consumed tokens: 41943040 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.830761E+00 | loss scale: 512.0 | grad norm: 12.783 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.028 | TFLOPs: 41.77 | +[default7]: iteration 21/ 6200 | consumed samples: 21504 | consumed tokens: 44040192 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.824716E+00 | loss scale: 512.0 | grad norm: 18.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.418 | TFLOPs: 41.89 | +[default7]: iteration 22/ 6200 | consumed samples: 22528 | consumed tokens: 46137344 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.778743E+00 | loss scale: 512.0 | grad norm: 10.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.121 | TFLOPs: 41.80 | +[default7]: iteration 23/ 6200 | consumed samples: 23552 | consumed tokens: 48234496 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.799567E+00 | loss scale: 512.0 | grad norm: 13.634 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.363 | TFLOPs: 41.87 | +[default7]: iteration 24/ 6200 | consumed samples: 24576 | consumed tokens: 50331648 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.778329E+00 | loss scale: 512.0 | grad norm: 12.683 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.720 | TFLOPs: 41.98 | +[default7]: iteration 25/ 6200 | consumed samples: 25600 | consumed tokens: 52428800 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.763761E+00 | loss scale: 512.0 | grad norm: 8.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.518 | TFLOPs: 41.92 | +[default7]: iteration 26/ 6200 | consumed samples: 26624 | consumed tokens: 54525952 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.735408E+00 | loss scale: 512.0 | grad norm: 15.635 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.802 | TFLOPs: 41.70 | +[default7]: iteration 27/ 6200 | consumed samples: 27648 | consumed tokens: 56623104 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.706975E+00 | loss scale: 512.0 | grad norm: 9.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.111 | TFLOPs: 41.79 | +[default7]: iteration 28/ 6200 | consumed samples: 28672 | consumed tokens: 58720256 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.695881E+00 | loss scale: 512.0 | grad norm: 10.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.342 | TFLOPs: 41.86 | +[default7]: iteration 29/ 6200 | consumed samples: 29696 | consumed tokens: 60817408 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.716398E+00 | loss scale: 512.0 | grad norm: 11.898 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.974 | TFLOPs: 41.75 | +[default7]: iteration 30/ 6200 | consumed samples: 30720 | consumed tokens: 62914560 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.668025E+00 | loss scale: 512.0 | grad norm: 7.793 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.212 | TFLOPs: 41.82 | +[default7]: iteration 31/ 6200 | consumed samples: 31744 | consumed tokens: 65011712 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.683297E+00 | loss scale: 512.0 | grad norm: 15.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.860 | TFLOPs: 41.72 | +[default7]: iteration 32/ 6200 | consumed samples: 32768 | consumed tokens: 67108864 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.679398E+00 | loss scale: 512.0 | grad norm: 9.971 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.075 | TFLOPs: 41.78 | +[default7]: iteration 33/ 6200 | consumed samples: 33792 | consumed tokens: 69206016 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.687909E+00 | loss scale: 512.0 | grad norm: 9.594 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.866 | TFLOPs: 41.72 | +[default7]: iteration 34/ 6200 | consumed samples: 34816 | consumed tokens: 71303168 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.633570E+00 | loss scale: 512.0 | grad norm: 18.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.435 | TFLOPs: 41.89 | +[default7]: iteration 35/ 6200 | consumed samples: 35840 | consumed tokens: 73400320 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.679142E+00 | loss scale: 512.0 | grad norm: 9.618 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.163 | TFLOPs: 41.81 | +[default7]: iteration 36/ 6200 | consumed samples: 36864 | consumed tokens: 75497472 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.627513E+00 | loss scale: 512.0 | grad norm: 12.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.646 | TFLOPs: 41.96 | +[default7]: iteration 37/ 6200 | consumed samples: 37888 | consumed tokens: 77594624 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.636409E+00 | loss scale: 512.0 | grad norm: 11.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.305 | TFLOPs: 41.85 | +[default7]: iteration 38/ 6200 | consumed samples: 38912 | consumed tokens: 79691776 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.605688E+00 | loss scale: 512.0 | grad norm: 8.640 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.334 | TFLOPs: 41.86 | +[default7]: iteration 39/ 6200 | consumed samples: 39936 | consumed tokens: 81788928 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.637872E+00 | loss scale: 512.0 | grad norm: 18.612 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.449 | TFLOPs: 41.90 | +[default7]: iteration 40/ 6200 | consumed samples: 40960 | consumed tokens: 83886080 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.634651E+00 | loss scale: 512.0 | grad norm: 10.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.429 | TFLOPs: 41.89 | +[default7]: iteration 41/ 6200 | consumed samples: 41984 | consumed tokens: 85983232 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.608800E+00 | loss scale: 512.0 | grad norm: 9.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.997 | TFLOPs: 42.06 | +[default7]: iteration 42/ 6200 | consumed samples: 43008 | consumed tokens: 88080384 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.613916E+00 | loss scale: 512.0 | grad norm: 16.071 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.723 | TFLOPs: 41.98 | +[default7]: iteration 43/ 6200 | consumed samples: 44032 | consumed tokens: 90177536 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.605471E+00 | loss scale: 512.0 | grad norm: 8.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.487 | TFLOPs: 41.91 | +[default7]: iteration 44/ 6200 | consumed samples: 45056 | consumed tokens: 92274688 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.587564E+00 | loss scale: 512.0 | grad norm: 6.640 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.604 | TFLOPs: 41.94 | +[default7]: iteration 45/ 6200 | consumed samples: 46080 | consumed tokens: 94371840 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.627640E+00 | loss scale: 512.0 | grad norm: 13.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.218 | TFLOPs: 42.13 | +[default7]: iteration 46/ 6200 | consumed samples: 47104 | consumed tokens: 96468992 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.603470E+00 | loss scale: 512.0 | grad norm: 11.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.800 | TFLOPs: 42.00 | +[default7]: iteration 47/ 6200 | consumed samples: 48128 | consumed tokens: 98566144 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.588888E+00 | loss scale: 512.0 | grad norm: 9.793 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.767 | TFLOPs: 41.99 | +[default7]: iteration 48/ 6200 | consumed samples: 49152 | consumed tokens: 100663296 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.569326E+00 | loss scale: 512.0 | grad norm: 15.969 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.908 | TFLOPs: 42.04 | +[default7]: iteration 49/ 6200 | consumed samples: 50176 | consumed tokens: 102760448 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.590109E+00 | loss scale: 512.0 | grad norm: 18.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.177 | TFLOPs: 42.12 | +[default7]: iteration 50/ 6200 | consumed samples: 51200 | consumed tokens: 104857600 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.577725E+00 | loss scale: 512.0 | grad norm: 14.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.209 | TFLOPs: 42.13 | +[default7]: iteration 51/ 6200 | consumed samples: 52224 | consumed tokens: 106954752 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.532461E+00 | loss scale: 512.0 | grad norm: 7.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.792 | TFLOPs: 42.00 | +[default7]: iteration 52/ 6200 | consumed samples: 53248 | consumed tokens: 109051904 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.613034E+00 | loss scale: 512.0 | grad norm: 14.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.001 | TFLOPs: 42.06 | +[default7]: iteration 53/ 6200 | consumed samples: 54272 | consumed tokens: 111149056 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.568268E+00 | loss scale: 512.0 | grad norm: 14.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.506 | TFLOPs: 41.91 | +[default7]: iteration 54/ 6200 | consumed samples: 55296 | consumed tokens: 113246208 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.553823E+00 | loss scale: 512.0 | grad norm: 7.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.808 | TFLOPs: 41.70 | +[default7]: iteration 55/ 6200 | consumed samples: 56320 | consumed tokens: 115343360 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.557262E+00 | loss scale: 512.0 | grad norm: 10.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.774 | TFLOPs: 41.69 | +[default7]: iteration 56/ 6200 | consumed samples: 57344 | consumed tokens: 117440512 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.558000E+00 | loss scale: 512.0 | grad norm: 10.953 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.758 | TFLOPs: 41.99 | +[default7]: iteration 57/ 6200 | consumed samples: 58368 | consumed tokens: 119537664 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.561691E+00 | loss scale: 512.0 | grad norm: 8.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.107 | TFLOPs: 42.10 | +[default7]: iteration 58/ 6200 | consumed samples: 59392 | consumed tokens: 121634816 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.545578E+00 | loss scale: 512.0 | grad norm: 10.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.191 | TFLOPs: 41.82 | +[default7]: iteration 59/ 6200 | consumed samples: 60416 | consumed tokens: 123731968 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.561702E+00 | loss scale: 512.0 | grad norm: 9.762 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.272 | TFLOPs: 42.15 | +[default7]: iteration 60/ 6200 | consumed samples: 61440 | consumed tokens: 125829120 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.521132E+00 | loss scale: 512.0 | grad norm: 7.843 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.236 | TFLOPs: 42.14 | +[default7]: iteration 61/ 6200 | consumed samples: 62464 | consumed tokens: 127926272 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.521083E+00 | loss scale: 512.0 | grad norm: 10.012 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.142 | TFLOPs: 42.11 | +[default7]: iteration 62/ 6200 | consumed samples: 63488 | consumed tokens: 130023424 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.553130E+00 | loss scale: 512.0 | grad norm: 12.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.994 | TFLOPs: 42.06 | +[default7]: iteration 63/ 6200 | consumed samples: 64512 | consumed tokens: 132120576 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.550875E+00 | loss scale: 512.0 | grad norm: 8.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.261 | TFLOPs: 42.14 | +[default7]: iteration 64/ 6200 | consumed samples: 65536 | consumed tokens: 134217728 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.530173E+00 | loss scale: 512.0 | grad norm: 16.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.271 | TFLOPs: 41.84 | +[default7]: iteration 65/ 6200 | consumed samples: 66560 | consumed tokens: 136314880 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.503808E+00 | loss scale: 512.0 | grad norm: 15.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.512 | TFLOPs: 41.91 | +[default7]: iteration 66/ 6200 | consumed samples: 67584 | consumed tokens: 138412032 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.512312E+00 | loss scale: 512.0 | grad norm: 10.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.208 | TFLOPs: 41.82 | +[default7]: iteration 67/ 6200 | consumed samples: 68608 | consumed tokens: 140509184 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.495471E+00 | loss scale: 512.0 | grad norm: 8.048 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.259 | TFLOPs: 41.84 | +[default7]: iteration 68/ 6200 | consumed samples: 69632 | consumed tokens: 142606336 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.471200E+00 | loss scale: 512.0 | grad norm: 10.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.448 | TFLOPs: 41.90 | +[default7]: iteration 69/ 6200 | consumed samples: 70656 | consumed tokens: 144703488 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.492173E+00 | loss scale: 512.0 | grad norm: 8.767 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.537 | TFLOPs: 41.92 | +[default7]: iteration 70/ 6200 | consumed samples: 71680 | consumed tokens: 146800640 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.499110E+00 | loss scale: 512.0 | grad norm: 8.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.223 | TFLOPs: 41.83 | +[default7]: iteration 71/ 6200 | consumed samples: 72704 | consumed tokens: 148897792 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.486695E+00 | loss scale: 512.0 | grad norm: 9.054 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.261 | TFLOPs: 41.84 | +[default7]: iteration 72/ 6200 | consumed samples: 73728 | consumed tokens: 150994944 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.478447E+00 | loss scale: 512.0 | grad norm: 7.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.886 | TFLOPs: 41.72 | +[default7]: iteration 73/ 6200 | consumed samples: 74752 | consumed tokens: 153092096 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.508703E+00 | loss scale: 512.0 | grad norm: 6.821 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.172 | TFLOPs: 41.81 | +[default7]: iteration 74/ 6200 | consumed samples: 75776 | consumed tokens: 155189248 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.485006E+00 | loss scale: 512.0 | grad norm: 8.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.042 | TFLOPs: 41.77 | +[default7]: iteration 75/ 6200 | consumed samples: 76800 | consumed tokens: 157286400 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.494733E+00 | loss scale: 512.0 | grad norm: 7.750 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.979 | TFLOPs: 41.75 | +[default7]: iteration 76/ 6200 | consumed samples: 77824 | consumed tokens: 159383552 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.487006E+00 | loss scale: 512.0 | grad norm: 6.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.244 | TFLOPs: 41.83 | +[default7]: iteration 77/ 6200 | consumed samples: 78848 | consumed tokens: 161480704 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.471199E+00 | loss scale: 512.0 | grad norm: 9.035 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.318 | TFLOPs: 41.86 | +[default7]: iteration 78/ 6200 | consumed samples: 79872 | consumed tokens: 163577856 | elapsed time per iteration (s): 7.51 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.480666E+00 | loss scale: 512.0 | grad norm: 6.721 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.409 | TFLOPs: 41.58 | +[default7]: iteration 79/ 6200 | consumed samples: 80896 | consumed tokens: 165675008 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.468760E+00 | loss scale: 512.0 | grad norm: 12.003 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.511 | TFLOPs: 41.61 | +[default7]: iteration 80/ 6200 | consumed samples: 81920 | consumed tokens: 167772160 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.488024E+00 | loss scale: 512.0 | grad norm: 11.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.321 | TFLOPs: 41.86 | +[default7]: iteration 81/ 6200 | consumed samples: 82944 | consumed tokens: 169869312 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.462920E+00 | loss scale: 512.0 | grad norm: 6.926 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.359 | TFLOPs: 41.87 | +[default7]: iteration 82/ 6200 | consumed samples: 83968 | consumed tokens: 171966464 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.477760E+00 | loss scale: 512.0 | grad norm: 8.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.326 | TFLOPs: 41.86 | +[default7]: iteration 83/ 6200 | consumed samples: 84992 | consumed tokens: 174063616 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.443093E+00 | loss scale: 512.0 | grad norm: 10.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.184 | TFLOPs: 41.81 | +[default7]: iteration 84/ 6200 | consumed samples: 86016 | consumed tokens: 176160768 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.466633E+00 | loss scale: 512.0 | grad norm: 8.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.429 | TFLOPs: 41.89 | +[default7]: iteration 85/ 6200 | consumed samples: 87040 | consumed tokens: 178257920 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.459766E+00 | loss scale: 512.0 | grad norm: 8.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.082 | TFLOPs: 41.78 | +[default7]: iteration 86/ 6200 | consumed samples: 88064 | consumed tokens: 180355072 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.447457E+00 | loss scale: 512.0 | grad norm: 14.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.091 | TFLOPs: 41.79 | +[default7]: iteration 87/ 6200 | consumed samples: 89088 | consumed tokens: 182452224 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.470727E+00 | loss scale: 512.0 | grad norm: 9.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.850 | TFLOPs: 41.71 | +[default7]: iteration 88/ 6200 | consumed samples: 90112 | consumed tokens: 184549376 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.462646E+00 | loss scale: 512.0 | grad norm: 9.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.927 | TFLOPs: 41.74 | +[default7]: iteration 89/ 6200 | consumed samples: 91136 | consumed tokens: 186646528 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.441497E+00 | loss scale: 512.0 | grad norm: 9.839 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.222 | TFLOPs: 41.83 | +[default7]: iteration 90/ 6200 | consumed samples: 92160 | consumed tokens: 188743680 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.431493E+00 | loss scale: 512.0 | grad norm: 11.786 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.382 | TFLOPs: 41.88 | +[default7]: iteration 91/ 6200 | consumed samples: 93184 | consumed tokens: 190840832 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.439633E+00 | loss scale: 512.0 | grad norm: 6.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.265 | TFLOPs: 41.84 | +[default7]: iteration 92/ 6200 | consumed samples: 94208 | consumed tokens: 192937984 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.453082E+00 | loss scale: 512.0 | grad norm: 13.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.186 | TFLOPs: 41.82 | +[default7]: iteration 93/ 6200 | consumed samples: 95232 | consumed tokens: 195035136 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.434034E+00 | loss scale: 512.0 | grad norm: 11.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.712 | TFLOPs: 41.67 | +[default7]: iteration 94/ 6200 | consumed samples: 96256 | consumed tokens: 197132288 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.480131E+00 | loss scale: 512.0 | grad norm: 14.077 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.722 | TFLOPs: 41.67 | +[default7]: iteration 95/ 6200 | consumed samples: 97280 | consumed tokens: 199229440 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.444450E+00 | loss scale: 512.0 | grad norm: 8.844 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.553 | TFLOPs: 41.62 | +[default7]: iteration 96/ 6200 | consumed samples: 98304 | consumed tokens: 201326592 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.445171E+00 | loss scale: 512.0 | grad norm: 11.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.526 | TFLOPs: 41.61 | +[default7]: iteration 97/ 6200 | consumed samples: 99328 | consumed tokens: 203423744 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.410716E+00 | loss scale: 512.0 | grad norm: 12.762 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.586 | TFLOPs: 41.63 | +[default7]: iteration 98/ 6200 | consumed samples: 100352 | consumed tokens: 205520896 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.445601E+00 | loss scale: 512.0 | grad norm: 7.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.620 | TFLOPs: 41.64 | +[default7]: iteration 99/ 6200 | consumed samples: 101376 | consumed tokens: 207618048 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.434500E+00 | loss scale: 512.0 | grad norm: 14.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.513 | TFLOPs: 41.61 | +[default7]: iteration 100/ 6200 | consumed samples: 102400 | consumed tokens: 209715200 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.418506E+00 | loss scale: 512.0 | grad norm: 11.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.648 | TFLOPs: 41.65 | +[default7]: iteration 101/ 6200 | consumed samples: 103424 | consumed tokens: 211812352 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.423526E+00 | loss scale: 512.0 | grad norm: 7.847 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.285 | TFLOPs: 41.85 | +[default7]: iteration 102/ 6200 | consumed samples: 104448 | consumed tokens: 213909504 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.398441E+00 | loss scale: 512.0 | grad norm: 9.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.195 | TFLOPs: 41.82 | +[default7]: iteration 103/ 6200 | consumed samples: 105472 | consumed tokens: 216006656 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.404961E+00 | loss scale: 512.0 | grad norm: 9.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.019 | TFLOPs: 41.76 | +[default7]: iteration 104/ 6200 | consumed samples: 106496 | consumed tokens: 218103808 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.401111E+00 | loss scale: 512.0 | grad norm: 7.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.228 | TFLOPs: 41.83 | +[default7]: iteration 105/ 6200 | consumed samples: 107520 | consumed tokens: 220200960 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.409334E+00 | loss scale: 512.0 | grad norm: 7.640 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.144 | TFLOPs: 41.80 | +[default7]: iteration 106/ 6200 | consumed samples: 108544 | consumed tokens: 222298112 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.396360E+00 | loss scale: 512.0 | grad norm: 13.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.692 | TFLOPs: 41.66 | +[default7]: iteration 107/ 6200 | consumed samples: 109568 | consumed tokens: 224395264 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.385433E+00 | loss scale: 512.0 | grad norm: 10.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.573 | TFLOPs: 41.63 | +[default7]: iteration 108/ 6200 | consumed samples: 110592 | consumed tokens: 226492416 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.393529E+00 | loss scale: 512.0 | grad norm: 6.820 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.617 | TFLOPs: 41.64 | +[default7]: iteration 109/ 6200 | consumed samples: 111616 | consumed tokens: 228589568 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.416043E+00 | loss scale: 512.0 | grad norm: 11.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.880 | TFLOPs: 41.72 | +[default7]: iteration 110/ 6200 | consumed samples: 112640 | consumed tokens: 230686720 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.390680E+00 | loss scale: 512.0 | grad norm: 12.033 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.787 | TFLOPs: 41.69 | +[default7]: iteration 111/ 6200 | consumed samples: 113664 | consumed tokens: 232783872 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.412825E+00 | loss scale: 512.0 | grad norm: 7.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.561 | TFLOPs: 41.62 | +[default7]: iteration 112/ 6200 | consumed samples: 114688 | consumed tokens: 234881024 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.391932E+00 | loss scale: 512.0 | grad norm: 11.785 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.959 | TFLOPs: 41.75 | +[default7]: iteration 113/ 6200 | consumed samples: 115712 | consumed tokens: 236978176 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.442313E+00 | loss scale: 512.0 | grad norm: 11.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.867 | TFLOPs: 41.72 | +[default7]: iteration 114/ 6200 | consumed samples: 116736 | consumed tokens: 239075328 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.412243E+00 | loss scale: 512.0 | grad norm: 8.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.480 | TFLOPs: 41.90 | +[default7]: iteration 115/ 6200 | consumed samples: 117760 | consumed tokens: 241172480 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.398322E+00 | loss scale: 512.0 | grad norm: 7.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.278 | TFLOPs: 41.84 | +[default7]: iteration 116/ 6200 | consumed samples: 118784 | consumed tokens: 243269632 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.407450E+00 | loss scale: 512.0 | grad norm: 12.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.813 | TFLOPs: 42.01 | +[default7]: iteration 117/ 6200 | consumed samples: 119808 | consumed tokens: 245366784 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.388371E+00 | loss scale: 512.0 | grad norm: 15.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.383 | TFLOPs: 41.88 | +[default7]: iteration 118/ 6200 | consumed samples: 120832 | consumed tokens: 247463936 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.371598E+00 | loss scale: 512.0 | grad norm: 9.929 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.247 | TFLOPs: 41.83 | +[default7]: iteration 119/ 6200 | consumed samples: 121856 | consumed tokens: 249561088 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.395462E+00 | loss scale: 512.0 | grad norm: 7.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.207 | TFLOPs: 41.82 | +[default7]: iteration 120/ 6200 | consumed samples: 122880 | consumed tokens: 251658240 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.396026E+00 | loss scale: 512.0 | grad norm: 15.906 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.172 | TFLOPs: 41.81 | +[default7]: iteration 121/ 6200 | consumed samples: 123904 | consumed tokens: 253755392 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.397779E+00 | loss scale: 512.0 | grad norm: 10.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.523 | TFLOPs: 41.92 | +[default7]: iteration 122/ 6200 | consumed samples: 124928 | consumed tokens: 255852544 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.406634E+00 | loss scale: 512.0 | grad norm: 9.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.393 | TFLOPs: 41.88 | +[default7]: iteration 123/ 6200 | consumed samples: 125952 | consumed tokens: 257949696 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.400014E+00 | loss scale: 512.0 | grad norm: 11.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.220 | TFLOPs: 41.83 | +[default7]: iteration 124/ 6200 | consumed samples: 126976 | consumed tokens: 260046848 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.382730E+00 | loss scale: 512.0 | grad norm: 13.942 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.373 | TFLOPs: 41.87 | +[default7]: iteration 125/ 6200 | consumed samples: 128000 | consumed tokens: 262144000 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.391176E+00 | loss scale: 512.0 | grad norm: 8.873 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.032 | TFLOPs: 41.77 | +[default7]:---------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 125 | lm loss value: 3.350115E+00 | lm loss PPL: 2.850600E+01 | +[default7]:---------------------------------------------------------------------------------------------------------- +[default7]:---------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 125 | lm loss value: 2.282754E+00 | lm loss PPL: 9.803647E+00 | +[default7]:---------------------------------------------------------------------------------------------- +[default7]: iteration 126/ 6200 | consumed samples: 129024 | consumed tokens: 264241152 | elapsed time per iteration (s): 52.80 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.393147E+00 | loss scale: 512.0 | grad norm: 6.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.392 | TFLOPs: 5.91 | +[default7]: iteration 127/ 6200 | consumed samples: 130048 | consumed tokens: 266338304 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.378163E+00 | loss scale: 512.0 | grad norm: 13.920 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.646 | TFLOPs: 41.96 | +[default7]: iteration 128/ 6200 | consumed samples: 131072 | consumed tokens: 268435456 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.384832E+00 | loss scale: 512.0 | grad norm: 14.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.087 | TFLOPs: 41.79 | +[default7]: iteration 129/ 6200 | consumed samples: 132096 | consumed tokens: 270532608 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.385854E+00 | loss scale: 512.0 | grad norm: 11.989 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.319 | TFLOPs: 41.86 | +[default7]: iteration 130/ 6200 | consumed samples: 133120 | consumed tokens: 272629760 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.411374E+00 | loss scale: 512.0 | grad norm: 7.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.649 | TFLOPs: 41.96 | +[default7]: iteration 131/ 6200 | consumed samples: 134144 | consumed tokens: 274726912 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.385915E+00 | loss scale: 512.0 | grad norm: 12.939 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.189 | TFLOPs: 41.82 | +[default7]: iteration 132/ 6200 | consumed samples: 135168 | consumed tokens: 276824064 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.413146E+00 | loss scale: 512.0 | grad norm: 12.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.275 | TFLOPs: 41.84 | +[default7]: iteration 133/ 6200 | consumed samples: 136192 | consumed tokens: 278921216 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.388109E+00 | loss scale: 512.0 | grad norm: 8.030 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.336 | TFLOPs: 41.86 | +[default7]: iteration 134/ 6200 | consumed samples: 137216 | consumed tokens: 281018368 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.385558E+00 | loss scale: 512.0 | grad norm: 9.700 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.754 | TFLOPs: 41.99 | +[default7]: iteration 135/ 6200 | consumed samples: 138240 | consumed tokens: 283115520 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.360871E+00 | loss scale: 512.0 | grad norm: 11.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.450 | TFLOPs: 41.90 | +[default7]: iteration 136/ 6200 | consumed samples: 139264 | consumed tokens: 285212672 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.371682E+00 | loss scale: 512.0 | grad norm: 7.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.331 | TFLOPs: 41.86 | +[default7]: iteration 137/ 6200 | consumed samples: 140288 | consumed tokens: 287309824 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.383505E+00 | loss scale: 512.0 | grad norm: 6.741 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.661 | TFLOPs: 41.96 | +[default7]: iteration 138/ 6200 | consumed samples: 141312 | consumed tokens: 289406976 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.365808E+00 | loss scale: 512.0 | grad norm: 10.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.855 | TFLOPs: 42.02 | +[default7]: iteration 139/ 6200 | consumed samples: 142336 | consumed tokens: 291504128 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.381868E+00 | loss scale: 512.0 | grad norm: 8.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.599 | TFLOPs: 41.94 | +[default7]: iteration 140/ 6200 | consumed samples: 143360 | consumed tokens: 293601280 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.368825E+00 | loss scale: 512.0 | grad norm: 7.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.450 | TFLOPs: 41.90 | +[default7]: iteration 141/ 6200 | consumed samples: 144384 | consumed tokens: 295698432 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.377739E+00 | loss scale: 512.0 | grad norm: 7.612 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.545 | TFLOPs: 41.92 | +[default7]: iteration 142/ 6200 | consumed samples: 145408 | consumed tokens: 297795584 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.380018E+00 | loss scale: 512.0 | grad norm: 11.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.846 | TFLOPs: 42.02 | +[default7]: iteration 143/ 6200 | consumed samples: 146432 | consumed tokens: 299892736 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.373972E+00 | loss scale: 512.0 | grad norm: 10.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.000 | TFLOPs: 42.06 | +[default7]: iteration 144/ 6200 | consumed samples: 147456 | consumed tokens: 301989888 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.374944E+00 | loss scale: 512.0 | grad norm: 7.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.177 | TFLOPs: 42.12 | +[default7]: iteration 145/ 6200 | consumed samples: 148480 | consumed tokens: 304087040 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.353444E+00 | loss scale: 512.0 | grad norm: 7.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.150 | TFLOPs: 41.80 | +[default7]: iteration 146/ 6200 | consumed samples: 149504 | consumed tokens: 306184192 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.376900E+00 | loss scale: 512.0 | grad norm: 8.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.725 | TFLOPs: 41.98 | +[default7]: iteration 147/ 6200 | consumed samples: 150528 | consumed tokens: 308281344 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.372908E+00 | loss scale: 512.0 | grad norm: 6.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.132 | TFLOPs: 41.80 | +[default7]: iteration 148/ 6200 | consumed samples: 151552 | consumed tokens: 310378496 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.391910E+00 | loss scale: 512.0 | grad norm: 7.945 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.244 | TFLOPs: 41.83 | +[default7]: iteration 149/ 6200 | consumed samples: 152576 | consumed tokens: 312475648 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.369546E+00 | loss scale: 512.0 | grad norm: 7.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.284 | TFLOPs: 41.85 | +[default7]: iteration 150/ 6200 | consumed samples: 153600 | consumed tokens: 314572800 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.365368E+00 | loss scale: 512.0 | grad norm: 8.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.043 | TFLOPs: 41.77 | +[default7]: iteration 151/ 6200 | consumed samples: 154624 | consumed tokens: 316669952 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.375805E+00 | loss scale: 512.0 | grad norm: 6.622 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.161 | TFLOPs: 41.81 | +[default7]: iteration 152/ 6200 | consumed samples: 155648 | consumed tokens: 318767104 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.361421E+00 | loss scale: 512.0 | grad norm: 8.762 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.567 | TFLOPs: 41.93 | +[default7]: iteration 153/ 6200 | consumed samples: 156672 | consumed tokens: 320864256 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.343291E+00 | loss scale: 512.0 | grad norm: 6.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.588 | TFLOPs: 41.94 | +[default7]: iteration 154/ 6200 | consumed samples: 157696 | consumed tokens: 322961408 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.338793E+00 | loss scale: 512.0 | grad norm: 9.980 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.045 | TFLOPs: 42.08 | +[default7]: iteration 155/ 6200 | consumed samples: 158720 | consumed tokens: 325058560 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.360717E+00 | loss scale: 512.0 | grad norm: 7.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.777 | TFLOPs: 42.00 | +[default7]: iteration 156/ 6200 | consumed samples: 159744 | consumed tokens: 327155712 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.359937E+00 | loss scale: 512.0 | grad norm: 7.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.121 | TFLOPs: 42.10 | +[default7]: iteration 157/ 6200 | consumed samples: 160768 | consumed tokens: 329252864 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.378606E+00 | loss scale: 512.0 | grad norm: 6.744 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.114 | TFLOPs: 42.10 | +[default7]: iteration 158/ 6200 | consumed samples: 161792 | consumed tokens: 331350016 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.325594E+00 | loss scale: 512.0 | grad norm: 7.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.950 | TFLOPs: 42.05 | +[default7]: iteration 159/ 6200 | consumed samples: 162816 | consumed tokens: 333447168 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.350003E+00 | loss scale: 512.0 | grad norm: 7.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.868 | TFLOPs: 42.02 | +[default7]: iteration 160/ 6200 | consumed samples: 163840 | consumed tokens: 335544320 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.365440E+00 | loss scale: 512.0 | grad norm: 9.809 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.832 | TFLOPs: 42.01 | +[default7]: iteration 161/ 6200 | consumed samples: 164864 | consumed tokens: 337641472 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.371936E+00 | loss scale: 512.0 | grad norm: 7.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.105 | TFLOPs: 42.10 | +[default7]: iteration 162/ 6200 | consumed samples: 165888 | consumed tokens: 339738624 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.353406E+00 | loss scale: 512.0 | grad norm: 7.049 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.790 | TFLOPs: 42.00 | +[default7]: iteration 163/ 6200 | consumed samples: 166912 | consumed tokens: 341835776 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.339454E+00 | loss scale: 512.0 | grad norm: 7.750 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.826 | TFLOPs: 42.01 | +[default7]: iteration 164/ 6200 | consumed samples: 167936 | consumed tokens: 343932928 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.333943E+00 | loss scale: 512.0 | grad norm: 7.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.004 | TFLOPs: 42.06 | +[default7]: iteration 165/ 6200 | consumed samples: 168960 | consumed tokens: 346030080 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.339835E+00 | loss scale: 512.0 | grad norm: 6.739 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.088 | TFLOPs: 42.09 | +[default7]: iteration 166/ 6200 | consumed samples: 169984 | consumed tokens: 348127232 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.338535E+00 | loss scale: 512.0 | grad norm: 5.798 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.307 | TFLOPs: 42.16 | +[default7]: iteration 167/ 6200 | consumed samples: 171008 | consumed tokens: 350224384 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.362861E+00 | loss scale: 512.0 | grad norm: 7.790 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.820 | TFLOPs: 42.01 | +[default7]: iteration 168/ 6200 | consumed samples: 172032 | consumed tokens: 352321536 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.305393E+00 | loss scale: 512.0 | grad norm: 6.875 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.271 | TFLOPs: 42.15 | +[default7]: iteration 169/ 6200 | consumed samples: 173056 | consumed tokens: 354418688 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.320323E+00 | loss scale: 512.0 | grad norm: 8.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.119 | TFLOPs: 42.10 | +[default7]: iteration 170/ 6200 | consumed samples: 174080 | consumed tokens: 356515840 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.344213E+00 | loss scale: 512.0 | grad norm: 6.734 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.735 | TFLOPs: 41.98 | +[default7]: iteration 171/ 6200 | consumed samples: 175104 | consumed tokens: 358612992 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.299716E+00 | loss scale: 512.0 | grad norm: 6.801 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.894 | TFLOPs: 42.03 | +[default7]: iteration 172/ 6200 | consumed samples: 176128 | consumed tokens: 360710144 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.334410E+00 | loss scale: 512.0 | grad norm: 9.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.846 | TFLOPs: 42.02 | +[default7]: iteration 173/ 6200 | consumed samples: 177152 | consumed tokens: 362807296 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.337914E+00 | loss scale: 512.0 | grad norm: 7.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.846 | TFLOPs: 42.02 | +[default7]: iteration 174/ 6200 | consumed samples: 178176 | consumed tokens: 364904448 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.337559E+00 | loss scale: 512.0 | grad norm: 7.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.015 | TFLOPs: 42.07 | +[default7]: iteration 175/ 6200 | consumed samples: 179200 | consumed tokens: 367001600 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.317107E+00 | loss scale: 512.0 | grad norm: 6.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.871 | TFLOPs: 42.02 | +[default7]: iteration 176/ 6200 | consumed samples: 180224 | consumed tokens: 369098752 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.326814E+00 | loss scale: 512.0 | grad norm: 7.953 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.674 | TFLOPs: 41.96 | +[default7]: iteration 177/ 6200 | consumed samples: 181248 | consumed tokens: 371195904 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.311367E+00 | loss scale: 512.0 | grad norm: 8.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.148 | TFLOPs: 42.11 | +[default7]: iteration 178/ 6200 | consumed samples: 182272 | consumed tokens: 373293056 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.341543E+00 | loss scale: 512.0 | grad norm: 7.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.647 | TFLOPs: 41.96 | +[default7]: iteration 179/ 6200 | consumed samples: 183296 | consumed tokens: 375390208 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.307726E+00 | loss scale: 512.0 | grad norm: 10.704 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.710 | TFLOPs: 41.98 | +[default7]: iteration 180/ 6200 | consumed samples: 184320 | consumed tokens: 377487360 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.348732E+00 | loss scale: 512.0 | grad norm: 7.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.899 | TFLOPs: 42.03 | +[default7]: iteration 181/ 6200 | consumed samples: 185344 | consumed tokens: 379584512 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.318314E+00 | loss scale: 512.0 | grad norm: 6.943 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.955 | TFLOPs: 42.05 | +[default7]: iteration 182/ 6200 | consumed samples: 186368 | consumed tokens: 381681664 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.327614E+00 | loss scale: 512.0 | grad norm: 9.814 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.532 | TFLOPs: 41.92 | +[default7]: iteration 183/ 6200 | consumed samples: 187392 | consumed tokens: 383778816 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.295839E+00 | loss scale: 512.0 | grad norm: 7.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.931 | TFLOPs: 42.04 | +[default7]: iteration 184/ 6200 | consumed samples: 188416 | consumed tokens: 385875968 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.320659E+00 | loss scale: 512.0 | grad norm: 7.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.886 | TFLOPs: 42.03 | +[default7]: iteration 185/ 6200 | consumed samples: 189440 | consumed tokens: 387973120 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.301526E+00 | loss scale: 512.0 | grad norm: 8.933 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.703 | TFLOPs: 41.97 | +[default7]: iteration 186/ 6200 | consumed samples: 190464 | consumed tokens: 390070272 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.333180E+00 | loss scale: 512.0 | grad norm: 9.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.014 | TFLOPs: 42.07 | +[default7]: iteration 187/ 6200 | consumed samples: 191488 | consumed tokens: 392167424 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.326668E+00 | loss scale: 512.0 | grad norm: 6.940 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.553 | TFLOPs: 41.93 | +[default7]: iteration 188/ 6200 | consumed samples: 192512 | consumed tokens: 394264576 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.315532E+00 | loss scale: 512.0 | grad norm: 8.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.499 | TFLOPs: 41.91 | +[default7]: iteration 189/ 6200 | consumed samples: 193536 | consumed tokens: 396361728 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.320204E+00 | loss scale: 512.0 | grad norm: 8.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.855 | TFLOPs: 42.02 | +[default7]: iteration 190/ 6200 | consumed samples: 194560 | consumed tokens: 398458880 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.305483E+00 | loss scale: 512.0 | grad norm: 5.658 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.722 | TFLOPs: 41.98 | +[default7]: iteration 191/ 6200 | consumed samples: 195584 | consumed tokens: 400556032 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.313178E+00 | loss scale: 512.0 | grad norm: 10.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.046 | TFLOPs: 42.08 | +[default7]: iteration 192/ 6200 | consumed samples: 196608 | consumed tokens: 402653184 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.310657E+00 | loss scale: 512.0 | grad norm: 6.877 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.898 | TFLOPs: 42.03 | +[default7]: iteration 193/ 6200 | consumed samples: 197632 | consumed tokens: 404750336 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.322167E+00 | loss scale: 512.0 | grad norm: 6.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.065 | TFLOPs: 42.08 | +[default7]: iteration 194/ 6200 | consumed samples: 198656 | consumed tokens: 406847488 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.344082E+00 | loss scale: 512.0 | grad norm: 6.760 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.250 | TFLOPs: 41.83 | +[default7]: iteration 195/ 6200 | consumed samples: 199680 | consumed tokens: 408944640 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.331860E+00 | loss scale: 512.0 | grad norm: 8.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.025 | TFLOPs: 42.07 | +[default7]: iteration 196/ 6200 | consumed samples: 200704 | consumed tokens: 411041792 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.300983E+00 | loss scale: 512.0 | grad norm: 6.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.839 | TFLOPs: 42.01 | +[default7]: iteration 197/ 6200 | consumed samples: 201728 | consumed tokens: 413138944 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.325945E+00 | loss scale: 512.0 | grad norm: 9.689 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.763 | TFLOPs: 41.99 | +[default7]: iteration 198/ 6200 | consumed samples: 202752 | consumed tokens: 415236096 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.302954E+00 | loss scale: 512.0 | grad norm: 7.981 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.933 | TFLOPs: 42.04 | +[default7]: iteration 199/ 6200 | consumed samples: 203776 | consumed tokens: 417333248 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.295123E+00 | loss scale: 512.0 | grad norm: 8.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.006 | TFLOPs: 42.07 | +[default7]: iteration 200/ 6200 | consumed samples: 204800 | consumed tokens: 419430400 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.326628E+00 | loss scale: 512.0 | grad norm: 6.678 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.725 | TFLOPs: 41.98 | +[default7]: iteration 201/ 6200 | consumed samples: 205824 | consumed tokens: 421527552 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.310267E+00 | loss scale: 512.0 | grad norm: 7.643 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.624 | TFLOPs: 41.95 | +[default7]: iteration 202/ 6200 | consumed samples: 206848 | consumed tokens: 423624704 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.307374E+00 | loss scale: 512.0 | grad norm: 8.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.762 | TFLOPs: 41.99 | +[default7]: iteration 203/ 6200 | consumed samples: 207872 | consumed tokens: 425721856 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.327124E+00 | loss scale: 512.0 | grad norm: 6.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.275 | TFLOPs: 42.15 | +[default7]: iteration 204/ 6200 | consumed samples: 208896 | consumed tokens: 427819008 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.291371E+00 | loss scale: 512.0 | grad norm: 6.908 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.897 | TFLOPs: 42.03 | +[default7]: iteration 205/ 6200 | consumed samples: 209920 | consumed tokens: 429916160 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.318806E+00 | loss scale: 512.0 | grad norm: 7.043 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.597 | TFLOPs: 41.94 | +[default7]: iteration 206/ 6200 | consumed samples: 210944 | consumed tokens: 432013312 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.301414E+00 | loss scale: 512.0 | grad norm: 8.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.704 | TFLOPs: 41.97 | +[default7]: iteration 207/ 6200 | consumed samples: 211968 | consumed tokens: 434110464 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.292867E+00 | loss scale: 512.0 | grad norm: 9.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.950 | TFLOPs: 42.05 | +[default7]: iteration 208/ 6200 | consumed samples: 212992 | consumed tokens: 436207616 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.299553E+00 | loss scale: 512.0 | grad norm: 6.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.156 | TFLOPs: 42.11 | +[default7]: iteration 209/ 6200 | consumed samples: 214016 | consumed tokens: 438304768 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.324435E+00 | loss scale: 512.0 | grad norm: 7.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.008 | TFLOPs: 42.07 | +[default7]: iteration 210/ 6200 | consumed samples: 215040 | consumed tokens: 440401920 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.289268E+00 | loss scale: 512.0 | grad norm: 9.996 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.779 | TFLOPs: 42.00 | +[default7]: iteration 211/ 6200 | consumed samples: 216064 | consumed tokens: 442499072 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.323461E+00 | loss scale: 512.0 | grad norm: 7.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.805 | TFLOPs: 42.00 | +[default7]: iteration 212/ 6200 | consumed samples: 217088 | consumed tokens: 444596224 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.281541E+00 | loss scale: 512.0 | grad norm: 6.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.872 | TFLOPs: 42.02 | +[default7]: iteration 213/ 6200 | consumed samples: 218112 | consumed tokens: 446693376 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.294707E+00 | loss scale: 512.0 | grad norm: 9.733 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.794 | TFLOPs: 42.00 | +[default7]: iteration 214/ 6200 | consumed samples: 219136 | consumed tokens: 448790528 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.285701E+00 | loss scale: 512.0 | grad norm: 8.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.541 | TFLOPs: 41.92 | +[default7]: iteration 215/ 6200 | consumed samples: 220160 | consumed tokens: 450887680 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.286628E+00 | loss scale: 512.0 | grad norm: 8.790 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.710 | TFLOPs: 41.98 | +[default7]: iteration 216/ 6200 | consumed samples: 221184 | consumed tokens: 452984832 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.297454E+00 | loss scale: 512.0 | grad norm: 7.036 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.136 | TFLOPs: 42.10 | +[default7]: iteration 217/ 6200 | consumed samples: 222208 | consumed tokens: 455081984 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.282178E+00 | loss scale: 512.0 | grad norm: 5.851 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.840 | TFLOPs: 42.01 | +[default7]: iteration 218/ 6200 | consumed samples: 223232 | consumed tokens: 457179136 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.297174E+00 | loss scale: 512.0 | grad norm: 8.762 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.819 | TFLOPs: 41.70 | +[default7]: iteration 219/ 6200 | consumed samples: 224256 | consumed tokens: 459276288 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.285686E+00 | loss scale: 512.0 | grad norm: 6.900 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.681 | TFLOPs: 41.97 | +[default7]: iteration 220/ 6200 | consumed samples: 225280 | consumed tokens: 461373440 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.273198E+00 | loss scale: 512.0 | grad norm: 7.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.434 | TFLOPs: 41.89 | +[default7]: iteration 221/ 6200 | consumed samples: 226304 | consumed tokens: 463470592 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.294319E+00 | loss scale: 512.0 | grad norm: 9.061 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.262 | TFLOPs: 41.84 | +[default7]: iteration 222/ 6200 | consumed samples: 227328 | consumed tokens: 465567744 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.295561E+00 | loss scale: 512.0 | grad norm: 7.625 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.267 | TFLOPs: 41.84 | +[default7]: iteration 223/ 6200 | consumed samples: 228352 | consumed tokens: 467664896 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.305871E+00 | loss scale: 512.0 | grad norm: 9.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.529 | TFLOPs: 41.92 | +[default7]: iteration 224/ 6200 | consumed samples: 229376 | consumed tokens: 469762048 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.298592E+00 | loss scale: 512.0 | grad norm: 7.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.323 | TFLOPs: 41.86 | +[default7]: iteration 225/ 6200 | consumed samples: 230400 | consumed tokens: 471859200 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.245916E+00 | loss scale: 512.0 | grad norm: 7.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.319 | TFLOPs: 41.86 | +[default7]: iteration 226/ 6200 | consumed samples: 231424 | consumed tokens: 473956352 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.304889E+00 | loss scale: 512.0 | grad norm: 6.757 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.582 | TFLOPs: 41.94 | +[default7]: iteration 227/ 6200 | consumed samples: 232448 | consumed tokens: 476053504 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.295489E+00 | loss scale: 512.0 | grad norm: 6.940 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.216 | TFLOPs: 41.82 | +[default7]: iteration 228/ 6200 | consumed samples: 233472 | consumed tokens: 478150656 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.288088E+00 | loss scale: 512.0 | grad norm: 11.959 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.365 | TFLOPs: 41.87 | +[default7]: iteration 229/ 6200 | consumed samples: 234496 | consumed tokens: 480247808 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.299755E+00 | loss scale: 512.0 | grad norm: 8.776 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.156 | TFLOPs: 41.81 | +[default7]: iteration 230/ 6200 | consumed samples: 235520 | consumed tokens: 482344960 | elapsed time per iteration (s): 7.51 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.257885E+00 | loss scale: 512.0 | grad norm: 7.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.367 | TFLOPs: 41.57 | +[default7]: iteration 231/ 6200 | consumed samples: 236544 | consumed tokens: 484442112 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.268943E+00 | loss scale: 512.0 | grad norm: 7.779 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.148 | TFLOPs: 41.80 | +[default7]: iteration 232/ 6200 | consumed samples: 237568 | consumed tokens: 486539264 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.299350E+00 | loss scale: 512.0 | grad norm: 7.706 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.354 | TFLOPs: 41.87 | +[default7]: iteration 233/ 6200 | consumed samples: 238592 | consumed tokens: 488636416 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.259674E+00 | loss scale: 512.0 | grad norm: 7.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.460 | TFLOPs: 41.90 | +[default7]: iteration 234/ 6200 | consumed samples: 239616 | consumed tokens: 490733568 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.277792E+00 | loss scale: 512.0 | grad norm: 7.985 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.232 | TFLOPs: 41.83 | +[default7]: iteration 235/ 6200 | consumed samples: 240640 | consumed tokens: 492830720 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.262034E+00 | loss scale: 512.0 | grad norm: 6.729 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.712 | TFLOPs: 41.98 | +[default7]: iteration 236/ 6200 | consumed samples: 241664 | consumed tokens: 494927872 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.282551E+00 | loss scale: 512.0 | grad norm: 7.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.537 | TFLOPs: 41.92 | +[default7]: iteration 237/ 6200 | consumed samples: 242688 | consumed tokens: 497025024 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.306879E+00 | loss scale: 512.0 | grad norm: 6.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.798 | TFLOPs: 42.00 | +[default7]: iteration 238/ 6200 | consumed samples: 243712 | consumed tokens: 499122176 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.299335E+00 | loss scale: 512.0 | grad norm: 9.986 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.617 | TFLOPs: 41.95 | +[default7]: iteration 239/ 6200 | consumed samples: 244736 | consumed tokens: 501219328 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.277647E+00 | loss scale: 512.0 | grad norm: 7.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.662 | TFLOPs: 41.96 | +[default7]: iteration 240/ 6200 | consumed samples: 245760 | consumed tokens: 503316480 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.292577E+00 | loss scale: 512.0 | grad norm: 8.796 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.124 | TFLOPs: 41.80 | +[default7]: iteration 241/ 6200 | consumed samples: 246784 | consumed tokens: 505413632 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.286413E+00 | loss scale: 512.0 | grad norm: 7.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.587 | TFLOPs: 41.94 | +[default7]: iteration 242/ 6200 | consumed samples: 247808 | consumed tokens: 507510784 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.258213E+00 | loss scale: 512.0 | grad norm: 5.958 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.787 | TFLOPs: 42.00 | +[default7]: iteration 243/ 6200 | consumed samples: 248832 | consumed tokens: 509607936 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.270693E+00 | loss scale: 512.0 | grad norm: 11.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.033 | TFLOPs: 42.07 | +[default7]: iteration 244/ 6200 | consumed samples: 249856 | consumed tokens: 511705088 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.287351E+00 | loss scale: 512.0 | grad norm: 7.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.750 | TFLOPs: 41.99 | +[default7]: iteration 245/ 6200 | consumed samples: 250880 | consumed tokens: 513802240 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.245288E+00 | loss scale: 512.0 | grad norm: 8.979 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.389 | TFLOPs: 42.18 | +[default7]: iteration 246/ 6200 | consumed samples: 251904 | consumed tokens: 515899392 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.266739E+00 | loss scale: 512.0 | grad norm: 8.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.278 | TFLOPs: 42.15 | +[default7]: iteration 247/ 6200 | consumed samples: 252928 | consumed tokens: 517996544 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.263958E+00 | loss scale: 512.0 | grad norm: 9.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.530 | TFLOPs: 42.23 | +[default7]: iteration 248/ 6200 | consumed samples: 253952 | consumed tokens: 520093696 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.274333E+00 | loss scale: 512.0 | grad norm: 8.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.324 | TFLOPs: 42.16 | +[default7]: iteration 249/ 6200 | consumed samples: 254976 | consumed tokens: 522190848 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.267277E+00 | loss scale: 512.0 | grad norm: 7.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.505 | TFLOPs: 42.22 | +[default7]: iteration 250/ 6200 | consumed samples: 256000 | consumed tokens: 524288000 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.264982E+00 | loss scale: 512.0 | grad norm: 6.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.447 | TFLOPs: 42.20 | +[default7]:---------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 250 | lm loss value: 3.365986E+00 | lm loss PPL: 2.896203E+01 | +[default7]:---------------------------------------------------------------------------------------------------------- +[default7]:---------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 250 | lm loss value: 2.170424E+00 | lm loss PPL: 8.762002E+00 | +[default7]:---------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 250 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 13:14:51,139] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step250 is begin to save! +[default0]:[2022-10-06 13:14:51,145] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,550] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,586] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,672] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,700] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,728] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,755] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,784] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,812] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,839] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,867] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,924] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:51,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:51,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:52,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:52,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:52,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:52,038] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:52,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:52,067] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:52,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:52,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:52,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:52,125] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:52,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:52,153] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 13:14:52,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:52,181] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:52,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:52,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:52,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:52,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 13:14:52,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 13:14:52,237] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/mp_rank_00_model_states.pt +[default0]:[2022-10-06 13:14:52,237] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:14:52,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 13:14:52,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 13:14:52,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 13:14:52,452] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 13:14:52,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 13:14:52,464] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 13:14:52,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 13:14:52,466] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 13:14:52,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 13:14:52,459] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 13:14:52,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 13:14:52,454] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 13:14:52,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 13:14:52,436] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 13:14:52,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 13:14:52,481] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 13:14:52,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 13:14:52,455] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 13:14:52,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 13:14:52,560] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 13:14:52,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 13:14:52,560] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 13:14:52,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 13:14:52,564] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 13:14:52,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 13:14:52,558] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 13:14:52,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 13:14:52,601] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 13:14:52,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 13:14:52,560] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 13:14:52,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 13:14:52,559] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 13:14:52,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 13:14:52,583] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 13:14:52,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 13:14:52,574] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 13:14:52,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 13:14:52,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 13:14:52,574] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 13:14:52,538] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 13:14:52,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 13:14:52,553] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 13:14:52,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 13:14:52,617] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 13:14:52,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 13:14:52,560] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 13:14:52,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 13:14:52,562] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 13:14:52,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 13:14:52,561] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 13:14:52,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 13:14:52,560] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 13:14:52,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 13:14:52,614] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 13:14:52,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 13:14:52,702] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 13:14:52,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 13:14:52,700] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 13:14:52,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 13:14:52,691] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default0]: successfully saved checkpoint at iteration 250 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default4]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default1]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default1]:[2022-10-06 13:14:52,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 13:14:52,717] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default4]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default7]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default3]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default6]:[2022-10-06 13:14:52,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 13:14:52,767] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default6]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default7]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default3]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default6]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default5]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default2]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default4]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default1]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default3]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default0]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default6]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default2]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default5]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default7]:[2022-10-06 13:14:52,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 13:14:52,762] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step250/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default7]:time (ms) | save-checkpoint: 1630.89 +[default2]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default0]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default2]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default5]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default3]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default1]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default4]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default5]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default0]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default7]:[2022-10-06 13:14:52,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step250 is ready now! +[default7]: iteration 251/ 6200 | consumed samples: 257024 | consumed tokens: 526385152 | elapsed time per iteration (s): 53.71 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.238784E+00 | loss scale: 512.0 | grad norm: 7.861 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.066 | TFLOPs: 5.81 | +[default7]: iteration 252/ 6200 | consumed samples: 258048 | consumed tokens: 528482304 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.253252E+00 | loss scale: 512.0 | grad norm: 6.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.516 | TFLOPs: 41.92 | +[default7]: iteration 253/ 6200 | consumed samples: 259072 | consumed tokens: 530579456 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.265096E+00 | loss scale: 512.0 | grad norm: 7.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.434 | TFLOPs: 41.89 | +[default7]: iteration 254/ 6200 | consumed samples: 260096 | consumed tokens: 532676608 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.245236E+00 | loss scale: 512.0 | grad norm: 9.624 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.529 | TFLOPs: 41.92 | +[default7]: iteration 255/ 6200 | consumed samples: 261120 | consumed tokens: 534773760 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.250880E+00 | loss scale: 512.0 | grad norm: 7.663 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.280 | TFLOPs: 41.84 | +[default7]: iteration 256/ 6200 | consumed samples: 262144 | consumed tokens: 536870912 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.267555E+00 | loss scale: 512.0 | grad norm: 5.825 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.239 | TFLOPs: 41.83 | +[default7]: iteration 257/ 6200 | consumed samples: 263168 | consumed tokens: 538968064 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.248259E+00 | loss scale: 512.0 | grad norm: 6.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.960 | TFLOPs: 42.05 | +[default7]: iteration 258/ 6200 | consumed samples: 264192 | consumed tokens: 541065216 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.273431E+00 | loss scale: 512.0 | grad norm: 6.941 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.641 | TFLOPs: 41.95 | +[default7]: iteration 259/ 6200 | consumed samples: 265216 | consumed tokens: 543162368 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.300520E+00 | loss scale: 512.0 | grad norm: 6.690 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.970 | TFLOPs: 42.05 | +[default7]: iteration 260/ 6200 | consumed samples: 266240 | consumed tokens: 545259520 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.250601E+00 | loss scale: 512.0 | grad norm: 7.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.888 | TFLOPs: 42.03 | +[default7]: iteration 261/ 6200 | consumed samples: 267264 | consumed tokens: 547356672 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.249010E+00 | loss scale: 512.0 | grad norm: 7.773 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.451 | TFLOPs: 41.90 | +[default7]: iteration 262/ 6200 | consumed samples: 268288 | consumed tokens: 549453824 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.269816E+00 | loss scale: 512.0 | grad norm: 7.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.109 | TFLOPs: 42.10 | +[default7]: iteration 263/ 6200 | consumed samples: 269312 | consumed tokens: 551550976 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.259658E+00 | loss scale: 512.0 | grad norm: 9.743 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.893 | TFLOPs: 42.03 | +[default7]: iteration 264/ 6200 | consumed samples: 270336 | consumed tokens: 553648128 | elapsed time per iteration (s): 9.17 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.278472E+00 | loss scale: 512.0 | grad norm: 7.882 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 111.680 | TFLOPs: 34.04 | +[default7]: iteration 265/ 6200 | consumed samples: 271360 | consumed tokens: 555745280 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.248218E+00 | loss scale: 512.0 | grad norm: 8.678 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.082 | TFLOPs: 42.09 | +[default7]: iteration 266/ 6200 | consumed samples: 272384 | consumed tokens: 557842432 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.237271E+00 | loss scale: 512.0 | grad norm: 9.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.285 | TFLOPs: 42.15 | +[default7]: iteration 267/ 6200 | consumed samples: 273408 | consumed tokens: 559939584 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.241939E+00 | loss scale: 512.0 | grad norm: 6.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.597 | TFLOPs: 42.25 | +[default7]: iteration 268/ 6200 | consumed samples: 274432 | consumed tokens: 562036736 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.243995E+00 | loss scale: 512.0 | grad norm: 12.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.120 | TFLOPs: 42.10 | +[default7]: iteration 269/ 6200 | consumed samples: 275456 | consumed tokens: 564133888 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.247694E+00 | loss scale: 512.0 | grad norm: 8.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.340 | TFLOPs: 42.17 | +[default7]: iteration 270/ 6200 | consumed samples: 276480 | consumed tokens: 566231040 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.262887E+00 | loss scale: 512.0 | grad norm: 7.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.639 | TFLOPs: 41.95 | +[default7]: iteration 271/ 6200 | consumed samples: 277504 | consumed tokens: 568328192 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.265881E+00 | loss scale: 512.0 | grad norm: 8.778 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.509 | TFLOPs: 41.91 | +[default7]: iteration 272/ 6200 | consumed samples: 278528 | consumed tokens: 570425344 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.261761E+00 | loss scale: 512.0 | grad norm: 7.692 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.123 | TFLOPs: 42.10 | +[default7]: iteration 273/ 6200 | consumed samples: 279552 | consumed tokens: 572522496 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.234912E+00 | loss scale: 512.0 | grad norm: 6.882 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.001 | TFLOPs: 42.06 | +[default7]: iteration 274/ 6200 | consumed samples: 280576 | consumed tokens: 574619648 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.236753E+00 | loss scale: 512.0 | grad norm: 7.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.957 | TFLOPs: 42.05 | +[default7]: iteration 275/ 6200 | consumed samples: 281600 | consumed tokens: 576716800 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.246160E+00 | loss scale: 512.0 | grad norm: 7.778 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.295 | TFLOPs: 42.15 | +[default7]: iteration 276/ 6200 | consumed samples: 282624 | consumed tokens: 578813952 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.233337E+00 | loss scale: 512.0 | grad norm: 6.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.077 | TFLOPs: 42.09 | +[default7]: iteration 277/ 6200 | consumed samples: 283648 | consumed tokens: 580911104 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.249277E+00 | loss scale: 512.0 | grad norm: 9.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.681 | TFLOPs: 41.97 | +[default7]: iteration 278/ 6200 | consumed samples: 284672 | consumed tokens: 583008256 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.250739E+00 | loss scale: 512.0 | grad norm: 8.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.211 | TFLOPs: 41.82 | +[default7]: iteration 279/ 6200 | consumed samples: 285696 | consumed tokens: 585105408 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.258326E+00 | loss scale: 512.0 | grad norm: 6.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.097 | TFLOPs: 41.79 | +[default7]: iteration 280/ 6200 | consumed samples: 286720 | consumed tokens: 587202560 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.253127E+00 | loss scale: 512.0 | grad norm: 9.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.761 | TFLOPs: 41.99 | +[default7]: iteration 281/ 6200 | consumed samples: 287744 | consumed tokens: 589299712 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.261188E+00 | loss scale: 512.0 | grad norm: 6.657 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.803 | TFLOPs: 42.00 | +[default7]: iteration 282/ 6200 | consumed samples: 288768 | consumed tokens: 591396864 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.245436E+00 | loss scale: 512.0 | grad norm: 7.735 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.607 | TFLOPs: 41.94 | +[default7]: iteration 283/ 6200 | consumed samples: 289792 | consumed tokens: 593494016 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.233810E+00 | loss scale: 512.0 | grad norm: 12.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.118 | TFLOPs: 42.10 | +[default7]: iteration 284/ 6200 | consumed samples: 290816 | consumed tokens: 595591168 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.265691E+00 | loss scale: 512.0 | grad norm: 5.799 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.297 | TFLOPs: 41.85 | +[default7]: iteration 285/ 6200 | consumed samples: 291840 | consumed tokens: 597688320 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.262865E+00 | loss scale: 512.0 | grad norm: 7.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.289 | TFLOPs: 41.85 | +[default7]: iteration 286/ 6200 | consumed samples: 292864 | consumed tokens: 599785472 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.236196E+00 | loss scale: 512.0 | grad norm: 10.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.489 | TFLOPs: 41.91 | +[default7]: iteration 287/ 6200 | consumed samples: 293888 | consumed tokens: 601882624 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.239239E+00 | loss scale: 512.0 | grad norm: 7.954 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.755 | TFLOPs: 41.99 | +[default7]: iteration 288/ 6200 | consumed samples: 294912 | consumed tokens: 603979776 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.268646E+00 | loss scale: 512.0 | grad norm: 6.949 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.843 | TFLOPs: 42.02 | +[default7]: iteration 289/ 6200 | consumed samples: 295936 | consumed tokens: 606076928 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.229734E+00 | loss scale: 512.0 | grad norm: 6.991 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.740 | TFLOPs: 41.98 | +[default7]: iteration 290/ 6200 | consumed samples: 296960 | consumed tokens: 608174080 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.273962E+00 | loss scale: 512.0 | grad norm: 7.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.701 | TFLOPs: 41.97 | +[default7]: iteration 291/ 6200 | consumed samples: 297984 | consumed tokens: 610271232 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.239628E+00 | loss scale: 512.0 | grad norm: 5.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.708 | TFLOPs: 41.97 | +[default7]: iteration 292/ 6200 | consumed samples: 299008 | consumed tokens: 612368384 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.227209E+00 | loss scale: 512.0 | grad norm: 10.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.723 | TFLOPs: 41.98 | +[default7]: iteration 293/ 6200 | consumed samples: 300032 | consumed tokens: 614465536 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.228228E+00 | loss scale: 512.0 | grad norm: 6.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.675 | TFLOPs: 41.96 | +[default7]: iteration 294/ 6200 | consumed samples: 301056 | consumed tokens: 616562688 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.256846E+00 | loss scale: 512.0 | grad norm: 7.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.748 | TFLOPs: 41.99 | +[default7]: iteration 295/ 6200 | consumed samples: 302080 | consumed tokens: 618659840 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.259079E+00 | loss scale: 512.0 | grad norm: 8.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.449 | TFLOPs: 41.90 | +[default7]: iteration 296/ 6200 | consumed samples: 303104 | consumed tokens: 620756992 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.241565E+00 | loss scale: 512.0 | grad norm: 6.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.369 | TFLOPs: 41.87 | +[default7]: iteration 297/ 6200 | consumed samples: 304128 | consumed tokens: 622854144 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.242965E+00 | loss scale: 512.0 | grad norm: 13.682 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.886 | TFLOPs: 41.72 | +[default7]: iteration 298/ 6200 | consumed samples: 305152 | consumed tokens: 624951296 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.248442E+00 | loss scale: 512.0 | grad norm: 13.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.379 | TFLOPs: 41.87 | +[default7]: iteration 299/ 6200 | consumed samples: 306176 | consumed tokens: 627048448 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.203566E+00 | loss scale: 512.0 | grad norm: 6.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.267 | TFLOPs: 41.84 | +[default7]: iteration 300/ 6200 | consumed samples: 307200 | consumed tokens: 629145600 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.262089E+00 | loss scale: 512.0 | grad norm: 12.982 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.978 | TFLOPs: 42.06 | +[default7]: iteration 301/ 6200 | consumed samples: 308224 | consumed tokens: 631242752 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.273650E+00 | loss scale: 512.0 | grad norm: 11.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.683 | TFLOPs: 41.97 | +[default7]: iteration 302/ 6200 | consumed samples: 309248 | consumed tokens: 633339904 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.229026E+00 | loss scale: 512.0 | grad norm: 8.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.898 | TFLOPs: 42.03 | +[default7]: iteration 303/ 6200 | consumed samples: 310272 | consumed tokens: 635437056 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.215629E+00 | loss scale: 512.0 | grad norm: 6.867 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.304 | TFLOPs: 41.85 | +[default7]: iteration 304/ 6200 | consumed samples: 311296 | consumed tokens: 637534208 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.222796E+00 | loss scale: 512.0 | grad norm: 8.040 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.656 | TFLOPs: 41.96 | +[default7]: iteration 305/ 6200 | consumed samples: 312320 | consumed tokens: 639631360 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.204370E+00 | loss scale: 512.0 | grad norm: 6.927 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.284 | TFLOPs: 41.85 | +[default7]: iteration 306/ 6200 | consumed samples: 313344 | consumed tokens: 641728512 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.257855E+00 | loss scale: 512.0 | grad norm: 7.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.710 | TFLOPs: 41.98 | +[default7]: iteration 307/ 6200 | consumed samples: 314368 | consumed tokens: 643825664 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.210588E+00 | loss scale: 512.0 | grad norm: 6.932 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.633 | TFLOPs: 41.95 | +[default7]: iteration 308/ 6200 | consumed samples: 315392 | consumed tokens: 645922816 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.224705E+00 | loss scale: 512.0 | grad norm: 6.050 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.582 | TFLOPs: 41.94 | +[default7]: iteration 309/ 6200 | consumed samples: 316416 | consumed tokens: 648019968 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.231567E+00 | loss scale: 512.0 | grad norm: 8.819 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.684 | TFLOPs: 41.97 | +[default7]: iteration 310/ 6200 | consumed samples: 317440 | consumed tokens: 650117120 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.232986E+00 | loss scale: 512.0 | grad norm: 7.723 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.273 | TFLOPs: 42.15 | +[default7]: iteration 311/ 6200 | consumed samples: 318464 | consumed tokens: 652214272 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.240313E+00 | loss scale: 512.0 | grad norm: 5.749 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.447 | TFLOPs: 41.89 | +[default7]: iteration 312/ 6200 | consumed samples: 319488 | consumed tokens: 654311424 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.229819E+00 | loss scale: 512.0 | grad norm: 7.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.245 | TFLOPs: 42.14 | +[default7]: iteration 313/ 6200 | consumed samples: 320512 | consumed tokens: 656408576 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.224878E+00 | loss scale: 512.0 | grad norm: 6.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.858 | TFLOPs: 42.02 | +[default7]: iteration 314/ 6200 | consumed samples: 321536 | consumed tokens: 658505728 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.231786E+00 | loss scale: 512.0 | grad norm: 6.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.105 | TFLOPs: 42.10 | +[default7]: iteration 315/ 6200 | consumed samples: 322560 | consumed tokens: 660602880 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.204524E+00 | loss scale: 512.0 | grad norm: 8.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.795 | TFLOPs: 42.00 | +[default7]: iteration 316/ 6200 | consumed samples: 323584 | consumed tokens: 662700032 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.244376E+00 | loss scale: 512.0 | grad norm: 6.916 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.701 | TFLOPs: 41.97 | +[default7]: iteration 317/ 6200 | consumed samples: 324608 | consumed tokens: 664797184 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.241600E+00 | loss scale: 512.0 | grad norm: 8.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.936 | TFLOPs: 42.04 | +[default7]: iteration 318/ 6200 | consumed samples: 325632 | consumed tokens: 666894336 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.227112E+00 | loss scale: 512.0 | grad norm: 6.649 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.744 | TFLOPs: 41.99 | +[default7]: iteration 319/ 6200 | consumed samples: 326656 | consumed tokens: 668991488 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.205815E+00 | loss scale: 512.0 | grad norm: 6.563 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.457 | TFLOPs: 41.90 | +[default7]: iteration 320/ 6200 | consumed samples: 327680 | consumed tokens: 671088640 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.223571E+00 | loss scale: 512.0 | grad norm: 5.729 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.328 | TFLOPs: 41.86 | +[default7]: iteration 321/ 6200 | consumed samples: 328704 | consumed tokens: 673185792 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.213333E+00 | loss scale: 512.0 | grad norm: 6.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.629 | TFLOPs: 41.95 | +[default7]: iteration 322/ 6200 | consumed samples: 329728 | consumed tokens: 675282944 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.207126E+00 | loss scale: 512.0 | grad norm: 6.912 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.227 | TFLOPs: 41.83 | +[default7]: iteration 323/ 6200 | consumed samples: 330752 | consumed tokens: 677380096 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.230408E+00 | loss scale: 512.0 | grad norm: 6.912 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.527 | TFLOPs: 41.92 | +[default7]: iteration 324/ 6200 | consumed samples: 331776 | consumed tokens: 679477248 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.205774E+00 | loss scale: 512.0 | grad norm: 6.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.783 | TFLOPs: 42.00 | +[default7]: iteration 325/ 6200 | consumed samples: 332800 | consumed tokens: 681574400 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.237024E+00 | loss scale: 512.0 | grad norm: 6.688 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.885 | TFLOPs: 42.03 | +[default7]: iteration 326/ 6200 | consumed samples: 333824 | consumed tokens: 683671552 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.220362E+00 | loss scale: 512.0 | grad norm: 8.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.788 | TFLOPs: 42.00 | +[default7]: iteration 327/ 6200 | consumed samples: 334848 | consumed tokens: 685768704 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.257274E+00 | loss scale: 512.0 | grad norm: 6.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.399 | TFLOPs: 41.88 | +[default7]: iteration 328/ 6200 | consumed samples: 335872 | consumed tokens: 687865856 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.238384E+00 | loss scale: 512.0 | grad norm: 6.937 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.695 | TFLOPs: 41.97 | +[default7]: iteration 329/ 6200 | consumed samples: 336896 | consumed tokens: 689963008 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.211722E+00 | loss scale: 512.0 | grad norm: 6.788 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.705 | TFLOPs: 41.97 | +[default7]: iteration 330/ 6200 | consumed samples: 337920 | consumed tokens: 692060160 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.239987E+00 | loss scale: 512.0 | grad norm: 6.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.129 | TFLOPs: 42.10 | +[default7]: iteration 331/ 6200 | consumed samples: 338944 | consumed tokens: 694157312 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.226895E+00 | loss scale: 512.0 | grad norm: 5.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.030 | TFLOPs: 42.07 | +[default7]: iteration 332/ 6200 | consumed samples: 339968 | consumed tokens: 696254464 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.236789E+00 | loss scale: 512.0 | grad norm: 6.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.320 | TFLOPs: 42.16 | +[default7]: iteration 333/ 6200 | consumed samples: 340992 | consumed tokens: 698351616 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.221235E+00 | loss scale: 512.0 | grad norm: 7.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.772 | TFLOPs: 41.99 | +[default7]: iteration 334/ 6200 | consumed samples: 342016 | consumed tokens: 700448768 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.213769E+00 | loss scale: 512.0 | grad norm: 6.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.888 | TFLOPs: 42.03 | +[default7]: iteration 335/ 6200 | consumed samples: 343040 | consumed tokens: 702545920 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.225195E+00 | loss scale: 512.0 | grad norm: 6.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.336 | TFLOPs: 41.86 | +[default7]: iteration 336/ 6200 | consumed samples: 344064 | consumed tokens: 704643072 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.222757E+00 | loss scale: 512.0 | grad norm: 5.786 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.353 | TFLOPs: 41.87 | +[default7]: iteration 337/ 6200 | consumed samples: 345088 | consumed tokens: 706740224 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.229426E+00 | loss scale: 512.0 | grad norm: 5.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.564 | TFLOPs: 41.93 | +[default7]: iteration 338/ 6200 | consumed samples: 346112 | consumed tokens: 708837376 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.228292E+00 | loss scale: 512.0 | grad norm: 15.814 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.764 | TFLOPs: 41.99 | +[default7]: iteration 339/ 6200 | consumed samples: 347136 | consumed tokens: 710934528 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.224968E+00 | loss scale: 512.0 | grad norm: 7.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.286 | TFLOPs: 41.85 | +[default7]: iteration 340/ 6200 | consumed samples: 348160 | consumed tokens: 713031680 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.219447E+00 | loss scale: 512.0 | grad norm: 6.074 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.636 | TFLOPs: 41.95 | +[default7]: iteration 341/ 6200 | consumed samples: 349184 | consumed tokens: 715128832 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.220902E+00 | loss scale: 512.0 | grad norm: 9.907 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.072 | TFLOPs: 42.09 | +[default7]: iteration 342/ 6200 | consumed samples: 350208 | consumed tokens: 717225984 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.239370E+00 | loss scale: 512.0 | grad norm: 6.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.567 | TFLOPs: 41.93 | +[default7]: iteration 343/ 6200 | consumed samples: 351232 | consumed tokens: 719323136 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.218944E+00 | loss scale: 512.0 | grad norm: 6.638 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.583 | TFLOPs: 41.94 | +[default7]: iteration 344/ 6200 | consumed samples: 352256 | consumed tokens: 721420288 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.210586E+00 | loss scale: 512.0 | grad norm: 10.563 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.845 | TFLOPs: 42.02 | +[default7]: iteration 345/ 6200 | consumed samples: 353280 | consumed tokens: 723517440 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.209209E+00 | loss scale: 512.0 | grad norm: 10.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.467 | TFLOPs: 42.21 | +[default7]: iteration 346/ 6200 | consumed samples: 354304 | consumed tokens: 725614592 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.214682E+00 | loss scale: 512.0 | grad norm: 7.862 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.031 | TFLOPs: 42.07 | +[default7]: iteration 347/ 6200 | consumed samples: 355328 | consumed tokens: 727711744 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.217156E+00 | loss scale: 512.0 | grad norm: 10.027 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.691 | TFLOPs: 42.27 | +[default7]: iteration 348/ 6200 | consumed samples: 356352 | consumed tokens: 729808896 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.189163E+00 | loss scale: 512.0 | grad norm: 9.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.367 | TFLOPs: 42.18 | +[default7]: iteration 349/ 6200 | consumed samples: 357376 | consumed tokens: 731906048 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.226360E+00 | loss scale: 512.0 | grad norm: 10.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.252 | TFLOPs: 42.14 | +[default7]: iteration 350/ 6200 | consumed samples: 358400 | consumed tokens: 734003200 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.205855E+00 | loss scale: 512.0 | grad norm: 8.817 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.121 | TFLOPs: 42.10 | +[default7]: iteration 351/ 6200 | consumed samples: 359424 | consumed tokens: 736100352 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.282279E+00 | loss scale: 512.0 | grad norm: 9.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.760 | TFLOPs: 41.99 | +[default7]: iteration 352/ 6200 | consumed samples: 360448 | consumed tokens: 738197504 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.225344E+00 | loss scale: 512.0 | grad norm: 8.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.137 | TFLOPs: 42.11 | +[default7]: iteration 353/ 6200 | consumed samples: 361472 | consumed tokens: 740294656 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.212989E+00 | loss scale: 512.0 | grad norm: 6.799 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.654 | TFLOPs: 41.96 | +[default7]: iteration 354/ 6200 | consumed samples: 362496 | consumed tokens: 742391808 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.225188E+00 | loss scale: 512.0 | grad norm: 8.793 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.750 | TFLOPs: 41.99 | +[default7]: iteration 355/ 6200 | consumed samples: 363520 | consumed tokens: 744488960 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.199508E+00 | loss scale: 512.0 | grad norm: 6.782 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.834 | TFLOPs: 42.01 | +[default7]: iteration 356/ 6200 | consumed samples: 364544 | consumed tokens: 746586112 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.203697E+00 | loss scale: 512.0 | grad norm: 6.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.199 | TFLOPs: 42.12 | +[default7]: iteration 357/ 6200 | consumed samples: 365568 | consumed tokens: 748683264 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.197691E+00 | loss scale: 512.0 | grad norm: 6.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.061 | TFLOPs: 42.08 | +[default7]: iteration 358/ 6200 | consumed samples: 366592 | consumed tokens: 750780416 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.208000E+00 | loss scale: 512.0 | grad norm: 6.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.990 | TFLOPs: 42.06 | +[default7]: iteration 359/ 6200 | consumed samples: 367616 | consumed tokens: 752877568 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.202459E+00 | loss scale: 512.0 | grad norm: 8.005 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.843 | TFLOPs: 42.02 | +[default7]: iteration 360/ 6200 | consumed samples: 368640 | consumed tokens: 754974720 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.192418E+00 | loss scale: 512.0 | grad norm: 7.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.729 | TFLOPs: 41.98 | +[default7]: iteration 361/ 6200 | consumed samples: 369664 | consumed tokens: 757071872 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.187928E+00 | loss scale: 512.0 | grad norm: 6.528 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.391 | TFLOPs: 42.18 | +[default7]: iteration 362/ 6200 | consumed samples: 370688 | consumed tokens: 759169024 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.194613E+00 | loss scale: 512.0 | grad norm: 6.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.147 | TFLOPs: 42.11 | +[default7]: iteration 363/ 6200 | consumed samples: 371712 | consumed tokens: 761266176 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.210244E+00 | loss scale: 512.0 | grad norm: 8.827 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.151 | TFLOPs: 42.11 | +[default7]: iteration 364/ 6200 | consumed samples: 372736 | consumed tokens: 763363328 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.236827E+00 | loss scale: 512.0 | grad norm: 7.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.810 | TFLOPs: 42.01 | +[default7]: iteration 365/ 6200 | consumed samples: 373760 | consumed tokens: 765460480 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.203139E+00 | loss scale: 512.0 | grad norm: 6.024 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.512 | TFLOPs: 41.91 | +[default7]: iteration 366/ 6200 | consumed samples: 374784 | consumed tokens: 767557632 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.214463E+00 | loss scale: 512.0 | grad norm: 6.837 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.861 | TFLOPs: 42.02 | +[default7]: iteration 367/ 6200 | consumed samples: 375808 | consumed tokens: 769654784 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.175058E+00 | loss scale: 512.0 | grad norm: 6.016 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.832 | TFLOPs: 42.01 | +[default7]: iteration 368/ 6200 | consumed samples: 376832 | consumed tokens: 771751936 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.203408E+00 | loss scale: 512.0 | grad norm: 5.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.382 | TFLOPs: 41.88 | +[default7]: iteration 369/ 6200 | consumed samples: 377856 | consumed tokens: 773849088 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.202764E+00 | loss scale: 512.0 | grad norm: 6.907 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.504 | TFLOPs: 41.91 | +[default7]: iteration 370/ 6200 | consumed samples: 378880 | consumed tokens: 775946240 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.234317E+00 | loss scale: 512.0 | grad norm: 6.035 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.814 | TFLOPs: 42.01 | +[default7]: iteration 371/ 6200 | consumed samples: 379904 | consumed tokens: 778043392 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.206966E+00 | loss scale: 512.0 | grad norm: 5.862 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.511 | TFLOPs: 41.91 | +[default7]: iteration 372/ 6200 | consumed samples: 380928 | consumed tokens: 780140544 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.198517E+00 | loss scale: 512.0 | grad norm: 6.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.997 | TFLOPs: 42.06 | +[default7]: iteration 373/ 6200 | consumed samples: 381952 | consumed tokens: 782237696 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.189731E+00 | loss scale: 512.0 | grad norm: 6.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.371 | TFLOPs: 42.18 | +[default7]: iteration 374/ 6200 | consumed samples: 382976 | consumed tokens: 784334848 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.189438E+00 | loss scale: 512.0 | grad norm: 7.968 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.462 | TFLOPs: 42.20 | +[default7]: iteration 375/ 6200 | consumed samples: 384000 | consumed tokens: 786432000 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.197859E+00 | loss scale: 512.0 | grad norm: 8.018 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.121 | TFLOPs: 42.10 | +[default7]:---------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 375 | lm loss value: 3.374798E+00 | lm loss PPL: 2.921837E+01 | +[default7]:---------------------------------------------------------------------------------------------------------- +[default7]:---------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 375 | lm loss value: 2.105509E+00 | lm loss PPL: 8.211278E+00 | +[default7]:---------------------------------------------------------------------------------------------- +[default7]: iteration 376/ 6200 | consumed samples: 385024 | consumed tokens: 788529152 | elapsed time per iteration (s): 52.60 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.166426E+00 | loss scale: 512.0 | grad norm: 6.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.468 | TFLOPs: 5.93 | +[default7]: iteration 377/ 6200 | consumed samples: 386048 | consumed tokens: 790626304 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.205758E+00 | loss scale: 512.0 | grad norm: 6.841 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.119 | TFLOPs: 42.10 | +[default7]: iteration 378/ 6200 | consumed samples: 387072 | consumed tokens: 792723456 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.217987E+00 | loss scale: 512.0 | grad norm: 8.905 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.237 | TFLOPs: 42.14 | +[default7]: iteration 379/ 6200 | consumed samples: 388096 | consumed tokens: 794820608 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.189454E+00 | loss scale: 512.0 | grad norm: 6.923 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.445 | TFLOPs: 42.20 | +[default7]: iteration 380/ 6200 | consumed samples: 389120 | consumed tokens: 796917760 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.202500E+00 | loss scale: 512.0 | grad norm: 6.690 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.443 | TFLOPs: 42.20 | +[default7]: iteration 381/ 6200 | consumed samples: 390144 | consumed tokens: 799014912 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.226776E+00 | loss scale: 512.0 | grad norm: 8.690 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.822 | TFLOPs: 42.01 | +[default7]: iteration 382/ 6200 | consumed samples: 391168 | consumed tokens: 801112064 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.200954E+00 | loss scale: 512.0 | grad norm: 7.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.771 | TFLOPs: 41.99 | +[default7]: iteration 383/ 6200 | consumed samples: 392192 | consumed tokens: 803209216 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.199775E+00 | loss scale: 512.0 | grad norm: 6.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.486 | TFLOPs: 41.91 | +[default7]: iteration 384/ 6200 | consumed samples: 393216 | consumed tokens: 805306368 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.204959E+00 | loss scale: 512.0 | grad norm: 9.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.129 | TFLOPs: 41.80 | +[default7]: iteration 385/ 6200 | consumed samples: 394240 | consumed tokens: 807403520 | elapsed time per iteration (s): 7.52 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.171720E+00 | loss scale: 512.0 | grad norm: 7.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.190 | TFLOPs: 41.51 | +[default7]: iteration 386/ 6200 | consumed samples: 395264 | consumed tokens: 809500672 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.220938E+00 | loss scale: 512.0 | grad norm: 9.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.755 | TFLOPs: 41.99 | +[default7]: iteration 387/ 6200 | consumed samples: 396288 | consumed tokens: 811597824 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.180085E+00 | loss scale: 512.0 | grad norm: 8.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.879 | TFLOPs: 42.03 | +[default7]: iteration 388/ 6200 | consumed samples: 397312 | consumed tokens: 813694976 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.183252E+00 | loss scale: 512.0 | grad norm: 8.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.189 | TFLOPs: 42.12 | +[default7]: iteration 389/ 6200 | consumed samples: 398336 | consumed tokens: 815792128 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.226209E+00 | loss scale: 512.0 | grad norm: 7.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.922 | TFLOPs: 42.04 | +[default7]: iteration 390/ 6200 | consumed samples: 399360 | consumed tokens: 817889280 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.187960E+00 | loss scale: 512.0 | grad norm: 8.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.772 | TFLOPs: 41.99 | +[default7]: iteration 391/ 6200 | consumed samples: 400384 | consumed tokens: 819986432 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.210975E+00 | loss scale: 512.0 | grad norm: 8.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.962 | TFLOPs: 42.05 | +[default7]: iteration 392/ 6200 | consumed samples: 401408 | consumed tokens: 822083584 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.201356E+00 | loss scale: 512.0 | grad norm: 6.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.337 | TFLOPs: 41.86 | +[default7]: iteration 393/ 6200 | consumed samples: 402432 | consumed tokens: 824180736 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.195940E+00 | loss scale: 512.0 | grad norm: 5.761 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.543 | TFLOPs: 41.92 | +[default7]: iteration 394/ 6200 | consumed samples: 403456 | consumed tokens: 826277888 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.194417E+00 | loss scale: 512.0 | grad norm: 10.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.494 | TFLOPs: 41.91 | +[default7]: iteration 395/ 6200 | consumed samples: 404480 | consumed tokens: 828375040 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.188719E+00 | loss scale: 512.0 | grad norm: 7.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.706 | TFLOPs: 41.97 | +[default7]: iteration 396/ 6200 | consumed samples: 405504 | consumed tokens: 830472192 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.175831E+00 | loss scale: 512.0 | grad norm: 7.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.132 | TFLOPs: 42.10 | +[default7]: iteration 397/ 6200 | consumed samples: 406528 | consumed tokens: 832569344 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.210867E+00 | loss scale: 512.0 | grad norm: 7.067 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.759 | TFLOPs: 41.99 | +[default7]: iteration 398/ 6200 | consumed samples: 407552 | consumed tokens: 834666496 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.176151E+00 | loss scale: 512.0 | grad norm: 7.522 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.897 | TFLOPs: 42.03 | +[default7]: iteration 399/ 6200 | consumed samples: 408576 | consumed tokens: 836763648 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.180943E+00 | loss scale: 512.0 | grad norm: 6.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.748 | TFLOPs: 41.99 | +[default7]: iteration 400/ 6200 | consumed samples: 409600 | consumed tokens: 838860800 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.183379E+00 | loss scale: 512.0 | grad norm: 6.594 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.182 | TFLOPs: 42.12 | +[default7]: iteration 401/ 6200 | consumed samples: 410624 | consumed tokens: 840957952 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.183820E+00 | loss scale: 512.0 | grad norm: 7.046 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.646 | TFLOPs: 41.96 | +[default7]: iteration 402/ 6200 | consumed samples: 411648 | consumed tokens: 843055104 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.183791E+00 | loss scale: 512.0 | grad norm: 7.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.366 | TFLOPs: 42.18 | +[default7]: iteration 403/ 6200 | consumed samples: 412672 | consumed tokens: 845152256 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.181470E+00 | loss scale: 512.0 | grad norm: 8.681 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.179 | TFLOPs: 42.12 | +[default7]: iteration 404/ 6200 | consumed samples: 413696 | consumed tokens: 847249408 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.166819E+00 | loss scale: 512.0 | grad norm: 7.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.039 | TFLOPs: 42.08 | +[default7]: iteration 405/ 6200 | consumed samples: 414720 | consumed tokens: 849346560 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.172935E+00 | loss scale: 512.0 | grad norm: 8.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.909 | TFLOPs: 42.04 | +[default7]: iteration 406/ 6200 | consumed samples: 415744 | consumed tokens: 851443712 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.176815E+00 | loss scale: 512.0 | grad norm: 9.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.921 | TFLOPs: 42.04 | +[default7]: iteration 407/ 6200 | consumed samples: 416768 | consumed tokens: 853540864 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.184901E+00 | loss scale: 512.0 | grad norm: 6.792 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.697 | TFLOPs: 41.97 | +[default7]: iteration 408/ 6200 | consumed samples: 417792 | consumed tokens: 855638016 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.152150E+00 | loss scale: 512.0 | grad norm: 7.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.701 | TFLOPs: 41.97 | +[default7]: iteration 409/ 6200 | consumed samples: 418816 | consumed tokens: 857735168 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.177510E+00 | loss scale: 512.0 | grad norm: 10.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.081 | TFLOPs: 42.09 | +[default7]: iteration 410/ 6200 | consumed samples: 419840 | consumed tokens: 859832320 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.191621E+00 | loss scale: 512.0 | grad norm: 6.678 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.961 | TFLOPs: 42.05 | +[default7]: iteration 411/ 6200 | consumed samples: 420864 | consumed tokens: 861929472 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.177199E+00 | loss scale: 512.0 | grad norm: 8.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.731 | TFLOPs: 41.98 | +[default7]: iteration 412/ 6200 | consumed samples: 421888 | consumed tokens: 864026624 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.187060E+00 | loss scale: 512.0 | grad norm: 6.989 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.728 | TFLOPs: 41.98 | +[default7]: iteration 413/ 6200 | consumed samples: 422912 | consumed tokens: 866123776 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.157162E+00 | loss scale: 512.0 | grad norm: 10.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.735 | TFLOPs: 41.98 | +[default7]: iteration 414/ 6200 | consumed samples: 423936 | consumed tokens: 868220928 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.173003E+00 | loss scale: 512.0 | grad norm: 9.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.701 | TFLOPs: 41.97 | +[default7]: iteration 415/ 6200 | consumed samples: 424960 | consumed tokens: 870318080 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.168359E+00 | loss scale: 512.0 | grad norm: 10.571 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.697 | TFLOPs: 41.97 | +[default7]: iteration 416/ 6200 | consumed samples: 425984 | consumed tokens: 872415232 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.174932E+00 | loss scale: 512.0 | grad norm: 6.969 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.163 | TFLOPs: 41.81 | +[default7]: iteration 417/ 6200 | consumed samples: 427008 | consumed tokens: 874512384 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.179733E+00 | loss scale: 512.0 | grad norm: 9.022 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.367 | TFLOPs: 41.87 | +[default7]: iteration 418/ 6200 | consumed samples: 428032 | consumed tokens: 876609536 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.196916E+00 | loss scale: 512.0 | grad norm: 7.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.584 | TFLOPs: 41.94 | +[default7]: iteration 419/ 6200 | consumed samples: 429056 | consumed tokens: 878706688 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.175304E+00 | loss scale: 512.0 | grad norm: 7.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.385 | TFLOPs: 41.88 | +[default7]: iteration 420/ 6200 | consumed samples: 430080 | consumed tokens: 880803840 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.167994E+00 | loss scale: 512.0 | grad norm: 8.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.933 | TFLOPs: 41.74 | +[default7]: iteration 421/ 6200 | consumed samples: 431104 | consumed tokens: 882900992 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.168328E+00 | loss scale: 512.0 | grad norm: 7.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.158 | TFLOPs: 41.81 | +[default7]: iteration 422/ 6200 | consumed samples: 432128 | consumed tokens: 884998144 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.198473E+00 | loss scale: 512.0 | grad norm: 9.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.401 | TFLOPs: 41.88 | +[default7]: iteration 423/ 6200 | consumed samples: 433152 | consumed tokens: 887095296 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.176053E+00 | loss scale: 512.0 | grad norm: 8.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.144 | TFLOPs: 41.80 | +[default7]: iteration 424/ 6200 | consumed samples: 434176 | consumed tokens: 889192448 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.187797E+00 | loss scale: 512.0 | grad norm: 6.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.118 | TFLOPs: 41.79 | +[default7]: iteration 425/ 6200 | consumed samples: 435200 | consumed tokens: 891289600 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.181390E+00 | loss scale: 512.0 | grad norm: 7.766 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.832 | TFLOPs: 41.71 | +[default7]: iteration 426/ 6200 | consumed samples: 436224 | consumed tokens: 893386752 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.200203E+00 | loss scale: 512.0 | grad norm: 7.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.303 | TFLOPs: 41.85 | +[default7]: iteration 427/ 6200 | consumed samples: 437248 | consumed tokens: 895483904 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.138978E+00 | loss scale: 512.0 | grad norm: 5.944 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.299 | TFLOPs: 41.85 | +[default7]: iteration 428/ 6200 | consumed samples: 438272 | consumed tokens: 897581056 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.180504E+00 | loss scale: 512.0 | grad norm: 7.573 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.281 | TFLOPs: 41.84 | +[default7]: iteration 429/ 6200 | consumed samples: 439296 | consumed tokens: 899678208 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.181091E+00 | loss scale: 512.0 | grad norm: 6.653 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.198 | TFLOPs: 41.82 | +[default7]: iteration 430/ 6200 | consumed samples: 440320 | consumed tokens: 901775360 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.154155E+00 | loss scale: 512.0 | grad norm: 5.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.928 | TFLOPs: 41.74 | +[default7]: iteration 431/ 6200 | consumed samples: 441344 | consumed tokens: 903872512 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.149670E+00 | loss scale: 512.0 | grad norm: 6.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.193 | TFLOPs: 41.82 | +[default7]: iteration 432/ 6200 | consumed samples: 442368 | consumed tokens: 905969664 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.165956E+00 | loss scale: 512.0 | grad norm: 6.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.355 | TFLOPs: 41.87 | +[default7]: iteration 433/ 6200 | consumed samples: 443392 | consumed tokens: 908066816 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.172493E+00 | loss scale: 512.0 | grad norm: 5.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.281 | TFLOPs: 42.15 | +[default7]: iteration 434/ 6200 | consumed samples: 444416 | consumed tokens: 910163968 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.186103E+00 | loss scale: 512.0 | grad norm: 8.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.579 | TFLOPs: 42.24 | +[default7]: iteration 435/ 6200 | consumed samples: 445440 | consumed tokens: 912261120 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.149826E+00 | loss scale: 512.0 | grad norm: 6.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.603 | TFLOPs: 42.25 | +[default7]: iteration 436/ 6200 | consumed samples: 446464 | consumed tokens: 914358272 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.181599E+00 | loss scale: 512.0 | grad norm: 6.774 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.110 | TFLOPs: 42.10 | +[default7]: iteration 437/ 6200 | consumed samples: 447488 | consumed tokens: 916455424 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.189258E+00 | loss scale: 512.0 | grad norm: 7.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.838 | TFLOPs: 42.01 | +[default7]: iteration 438/ 6200 | consumed samples: 448512 | consumed tokens: 918552576 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.152655E+00 | loss scale: 512.0 | grad norm: 5.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.290 | TFLOPs: 42.15 | +[default7]: iteration 439/ 6200 | consumed samples: 449536 | consumed tokens: 920649728 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.145385E+00 | loss scale: 512.0 | grad norm: 7.978 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.971 | TFLOPs: 42.05 | +[default7]: iteration 440/ 6200 | consumed samples: 450560 | consumed tokens: 922746880 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.169031E+00 | loss scale: 512.0 | grad norm: 6.022 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.122 | TFLOPs: 42.10 | +[default7]: iteration 441/ 6200 | consumed samples: 451584 | consumed tokens: 924844032 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.172901E+00 | loss scale: 512.0 | grad norm: 6.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.504 | TFLOPs: 42.22 | +[default7]: iteration 442/ 6200 | consumed samples: 452608 | consumed tokens: 926941184 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.172733E+00 | loss scale: 512.0 | grad norm: 6.058 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.295 | TFLOPs: 42.15 | +[default7]: iteration 443/ 6200 | consumed samples: 453632 | consumed tokens: 929038336 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.159406E+00 | loss scale: 512.0 | grad norm: 6.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.522 | TFLOPs: 42.22 | +[default7]: iteration 444/ 6200 | consumed samples: 454656 | consumed tokens: 931135488 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.187256E+00 | loss scale: 512.0 | grad norm: 5.645 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.924 | TFLOPs: 42.04 | +[default7]: iteration 445/ 6200 | consumed samples: 455680 | consumed tokens: 933232640 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.164047E+00 | loss scale: 512.0 | grad norm: 6.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.696 | TFLOPs: 41.97 | +[default7]: iteration 446/ 6200 | consumed samples: 456704 | consumed tokens: 935329792 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.191401E+00 | loss scale: 512.0 | grad norm: 6.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.662 | TFLOPs: 41.96 | +[default7]: iteration 447/ 6200 | consumed samples: 457728 | consumed tokens: 937426944 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.166166E+00 | loss scale: 512.0 | grad norm: 6.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.871 | TFLOPs: 42.02 | +[default7]: iteration 448/ 6200 | consumed samples: 458752 | consumed tokens: 939524096 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.171421E+00 | loss scale: 512.0 | grad norm: 7.991 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.084 | TFLOPs: 41.78 | +[default7]: iteration 449/ 6200 | consumed samples: 459776 | consumed tokens: 941621248 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.174603E+00 | loss scale: 512.0 | grad norm: 6.049 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.997 | TFLOPs: 41.76 | +[default7]: iteration 450/ 6200 | consumed samples: 460800 | consumed tokens: 943718400 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.153743E+00 | loss scale: 512.0 | grad norm: 6.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.964 | TFLOPs: 41.75 | +[default7]: iteration 451/ 6200 | consumed samples: 461824 | consumed tokens: 945815552 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.163117E+00 | loss scale: 512.0 | grad norm: 5.682 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.489 | TFLOPs: 42.21 | +[default7]: iteration 452/ 6200 | consumed samples: 462848 | consumed tokens: 947912704 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.162695E+00 | loss scale: 512.0 | grad norm: 7.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.073 | TFLOPs: 42.09 | +[default7]: iteration 453/ 6200 | consumed samples: 463872 | consumed tokens: 950009856 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.162932E+00 | loss scale: 512.0 | grad norm: 6.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.427 | TFLOPs: 42.19 | +[default7]: iteration 454/ 6200 | consumed samples: 464896 | consumed tokens: 952107008 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.175904E+00 | loss scale: 512.0 | grad norm: 5.813 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.330 | TFLOPs: 42.16 | +[default7]: iteration 455/ 6200 | consumed samples: 465920 | consumed tokens: 954204160 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.151876E+00 | loss scale: 512.0 | grad norm: 8.939 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.440 | TFLOPs: 42.20 | +[default7]: iteration 456/ 6200 | consumed samples: 466944 | consumed tokens: 956301312 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.148168E+00 | loss scale: 512.0 | grad norm: 6.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.949 | TFLOPs: 42.05 | +[default7]: iteration 457/ 6200 | consumed samples: 467968 | consumed tokens: 958398464 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.135580E+00 | loss scale: 512.0 | grad norm: 6.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.959 | TFLOPs: 41.75 | +[default7]: iteration 458/ 6200 | consumed samples: 468992 | consumed tokens: 960495616 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.164261E+00 | loss scale: 512.0 | grad norm: 6.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.650 | TFLOPs: 41.96 | +[default7]: iteration 459/ 6200 | consumed samples: 470016 | consumed tokens: 962592768 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.158025E+00 | loss scale: 512.0 | grad norm: 6.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.265 | TFLOPs: 41.84 | +[default7]: iteration 460/ 6200 | consumed samples: 471040 | consumed tokens: 964689920 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.160103E+00 | loss scale: 512.0 | grad norm: 6.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.492 | TFLOPs: 42.21 | +[default7]: iteration 461/ 6200 | consumed samples: 472064 | consumed tokens: 966787072 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.169669E+00 | loss scale: 512.0 | grad norm: 7.838 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.496 | TFLOPs: 42.21 | +[default7]: iteration 462/ 6200 | consumed samples: 473088 | consumed tokens: 968884224 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.156846E+00 | loss scale: 512.0 | grad norm: 5.755 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.518 | TFLOPs: 42.22 | +[default7]: iteration 463/ 6200 | consumed samples: 474112 | consumed tokens: 970981376 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.204983E+00 | loss scale: 512.0 | grad norm: 8.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.151 | TFLOPs: 42.11 | +[default7]: iteration 464/ 6200 | consumed samples: 475136 | consumed tokens: 973078528 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.167316E+00 | loss scale: 512.0 | grad norm: 5.655 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.185 | TFLOPs: 42.12 | +[default7]: iteration 465/ 6200 | consumed samples: 476160 | consumed tokens: 975175680 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.157420E+00 | loss scale: 512.0 | grad norm: 5.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.808 | TFLOPs: 42.00 | +[default7]: iteration 466/ 6200 | consumed samples: 477184 | consumed tokens: 977272832 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.169822E+00 | loss scale: 512.0 | grad norm: 6.032 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.753 | TFLOPs: 41.99 | +[default7]: iteration 467/ 6200 | consumed samples: 478208 | consumed tokens: 979369984 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.147315E+00 | loss scale: 512.0 | grad norm: 6.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.052 | TFLOPs: 42.08 | +[default7]: iteration 468/ 6200 | consumed samples: 479232 | consumed tokens: 981467136 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.177772E+00 | loss scale: 512.0 | grad norm: 6.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.892 | TFLOPs: 42.03 | +[default7]: iteration 469/ 6200 | consumed samples: 480256 | consumed tokens: 983564288 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.162268E+00 | loss scale: 512.0 | grad norm: 5.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.896 | TFLOPs: 42.03 | +[default7]: iteration 470/ 6200 | consumed samples: 481280 | consumed tokens: 985661440 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.153745E+00 | loss scale: 512.0 | grad norm: 5.896 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.139 | TFLOPs: 42.11 | +[default7]: iteration 471/ 6200 | consumed samples: 482304 | consumed tokens: 987758592 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.162363E+00 | loss scale: 512.0 | grad norm: 8.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.333 | TFLOPs: 42.17 | +[default7]: iteration 472/ 6200 | consumed samples: 483328 | consumed tokens: 989855744 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.163019E+00 | loss scale: 512.0 | grad norm: 7.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.914 | TFLOPs: 42.04 | +[default7]: iteration 473/ 6200 | consumed samples: 484352 | consumed tokens: 991952896 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.154898E+00 | loss scale: 512.0 | grad norm: 9.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.359 | TFLOPs: 42.17 | +[default7]: iteration 474/ 6200 | consumed samples: 485376 | consumed tokens: 994050048 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.142533E+00 | loss scale: 512.0 | grad norm: 5.875 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.947 | TFLOPs: 42.05 | +[default7]: iteration 475/ 6200 | consumed samples: 486400 | consumed tokens: 996147200 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.146974E+00 | loss scale: 512.0 | grad norm: 7.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.460 | TFLOPs: 42.20 | +[default7]: iteration 476/ 6200 | consumed samples: 487424 | consumed tokens: 998244352 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.165667E+00 | loss scale: 512.0 | grad norm: 5.789 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.517 | TFLOPs: 42.22 | +[default7]: iteration 477/ 6200 | consumed samples: 488448 | consumed tokens: 1000341504 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.108260E+00 | loss scale: 512.0 | grad norm: 7.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.143 | TFLOPs: 42.11 | +[default7]: iteration 478/ 6200 | consumed samples: 489472 | consumed tokens: 1002438656 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.130200E+00 | loss scale: 512.0 | grad norm: 7.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.106 | TFLOPs: 41.79 | +[default7]: iteration 479/ 6200 | consumed samples: 490496 | consumed tokens: 1004535808 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.151065E+00 | loss scale: 512.0 | grad norm: 7.748 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.864 | TFLOPs: 41.72 | +[default7]: iteration 480/ 6200 | consumed samples: 491520 | consumed tokens: 1006632960 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.157566E+00 | loss scale: 512.0 | grad norm: 5.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.738 | TFLOPs: 41.68 | +[default7]: iteration 481/ 6200 | consumed samples: 492544 | consumed tokens: 1008730112 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.142816E+00 | loss scale: 512.0 | grad norm: 6.056 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.118 | TFLOPs: 42.10 | +[default7]: iteration 482/ 6200 | consumed samples: 493568 | consumed tokens: 1010827264 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.141651E+00 | loss scale: 512.0 | grad norm: 7.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.418 | TFLOPs: 42.19 | +[default7]: iteration 483/ 6200 | consumed samples: 494592 | consumed tokens: 1012924416 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.145586E+00 | loss scale: 512.0 | grad norm: 8.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.384 | TFLOPs: 42.18 | +[default7]: iteration 484/ 6200 | consumed samples: 495616 | consumed tokens: 1015021568 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.145841E+00 | loss scale: 512.0 | grad norm: 7.594 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.372 | TFLOPs: 42.18 | +[default7]: iteration 485/ 6200 | consumed samples: 496640 | consumed tokens: 1017118720 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.154597E+00 | loss scale: 512.0 | grad norm: 7.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.307 | TFLOPs: 42.16 | +[default7]: iteration 486/ 6200 | consumed samples: 497664 | consumed tokens: 1019215872 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.155770E+00 | loss scale: 512.0 | grad norm: 6.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.597 | TFLOPs: 41.94 | +[default7]: iteration 487/ 6200 | consumed samples: 498688 | consumed tokens: 1021313024 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.164232E+00 | loss scale: 512.0 | grad norm: 7.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.251 | TFLOPs: 41.84 | +[default7]: iteration 488/ 6200 | consumed samples: 499712 | consumed tokens: 1023410176 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.144829E+00 | loss scale: 512.0 | grad norm: 5.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.894 | TFLOPs: 41.73 | +[default7]: iteration 489/ 6200 | consumed samples: 500736 | consumed tokens: 1025507328 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.168864E+00 | loss scale: 512.0 | grad norm: 6.936 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.197 | TFLOPs: 41.82 | +[default7]: iteration 490/ 6200 | consumed samples: 501760 | consumed tokens: 1027604480 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.161659E+00 | loss scale: 512.0 | grad norm: 5.879 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.552 | TFLOPs: 41.62 | +[default7]: iteration 491/ 6200 | consumed samples: 502784 | consumed tokens: 1029701632 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.150779E+00 | loss scale: 512.0 | grad norm: 6.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.009 | TFLOPs: 41.76 | +[default7]: iteration 492/ 6200 | consumed samples: 503808 | consumed tokens: 1031798784 | elapsed time per iteration (s): 7.51 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.133628E+00 | loss scale: 512.0 | grad norm: 6.767 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.295 | TFLOPs: 41.54 | +[default7]: iteration 493/ 6200 | consumed samples: 504832 | consumed tokens: 1033895936 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.168754E+00 | loss scale: 512.0 | grad norm: 6.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.673 | TFLOPs: 41.66 | +[default7]: iteration 494/ 6200 | consumed samples: 505856 | consumed tokens: 1035993088 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.134811E+00 | loss scale: 512.0 | grad norm: 7.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.649 | TFLOPs: 41.65 | +[default7]: iteration 495/ 6200 | consumed samples: 506880 | consumed tokens: 1038090240 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.144293E+00 | loss scale: 512.0 | grad norm: 7.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.937 | TFLOPs: 41.74 | +[default7]: iteration 496/ 6200 | consumed samples: 507904 | consumed tokens: 1040187392 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.137041E+00 | loss scale: 512.0 | grad norm: 7.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.169 | TFLOPs: 41.81 | +[default7]: iteration 497/ 6200 | consumed samples: 508928 | consumed tokens: 1042284544 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.168288E+00 | loss scale: 512.0 | grad norm: 7.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.713 | TFLOPs: 41.67 | +[default7]: iteration 498/ 6200 | consumed samples: 509952 | consumed tokens: 1044381696 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.132202E+00 | loss scale: 512.0 | grad norm: 6.984 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.952 | TFLOPs: 41.74 | +[default7]: iteration 499/ 6200 | consumed samples: 510976 | consumed tokens: 1046478848 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.150365E+00 | loss scale: 512.0 | grad norm: 5.792 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.141 | TFLOPs: 41.80 | +[default7]: iteration 500/ 6200 | consumed samples: 512000 | consumed tokens: 1048576000 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.107739E+00 | loss scale: 512.0 | grad norm: 6.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.069 | TFLOPs: 41.78 | +[default7]:---------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 500 | lm loss value: 3.389790E+00 | lm loss PPL: 2.965973E+01 | +[default7]:---------------------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 500 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 13:47:22,769] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step500 is begin to save! +[default0]:[2022-10-06 13:47:22,774] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_01-model_00-model_states.pt... +[default7]:---------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 500 | lm loss value: 2.046201E+00 | lm loss PPL: 7.738447E+00 | +[default7]:---------------------------------------------------------------------------------------------- +[default0]:[2022-10-06 13:47:23,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,176] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,261] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,289] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,624] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,652] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,677] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,731] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,758] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,813] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,839] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,865] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,891] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,919] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,946] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:23,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:23,974] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:24,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:24,000] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:24,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:24,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:24,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:24,074] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:24,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:24,101] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_25-model_00-model_states.pt... +[default2]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 13:47:24,170] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 13:47:24,170] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 13:47:24,170] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 13:47:24,170] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 13:47:24,170] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 13:47:24,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:24,127] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:24,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:24,152] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 13:47:24,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 13:47:24,154] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/mp_rank_00_model_states.pt +[default0]:[2022-10-06 13:47:24,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 13:47:24,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 13:47:24,170] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 13:47:24,170] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 13:47:24,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 13:47:24,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 13:47:24,345] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 13:47:24,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 13:47:24,431] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 13:47:24,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 13:47:24,447] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 13:47:24,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 13:47:24,470] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 13:47:24,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 13:47:24,443] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 13:47:24,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 13:47:24,446] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 13:47:24,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 13:47:24,491] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 13:47:24,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 13:47:24,475] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 13:47:24,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 13:47:24,557] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 13:47:24,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 13:47:24,509] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 13:47:24,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 13:47:24,494] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 13:47:24,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 13:47:24,591] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 13:47:24,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 13:47:24,529] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 13:47:24,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 13:47:24,528] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 13:47:24,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 13:47:24,577] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 13:47:24,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 13:47:24,568] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 13:47:24,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 13:47:24,609] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 13:47:24,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 13:47:24,562] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 13:47:24,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 13:47:24,598] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 13:47:24,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 13:47:24,713] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 13:47:24,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 13:47:24,657] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 13:47:24,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 13:47:24,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 13:47:24,700] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 13:47:24,703] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 13:47:24,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 13:47:24,727] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 13:47:24,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 13:47:24,701] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 13:47:24,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 13:47:24,746] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 13:47:24,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 13:47:24,760] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 13:47:24,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 13:47:24,797] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 13:47:24,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 13:47:24,802] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default7]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default0]:[2022-10-06 13:47:24,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 13:47:24,904] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default5]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default7]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default3]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default1]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default3]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default6]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default0]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default0]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default0]: successfully saved checkpoint at iteration 500 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default7]:[2022-10-06 13:47:24,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 13:47:24,869] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default7]:time (ms) | save-checkpoint: 2137.01 +[default3]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default6]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default1]:[2022-10-06 13:47:24,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 13:47:24,857] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step500/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default2]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default0]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default2]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default2]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default7]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default6]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default2]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default1]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default5]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default4]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default4]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default4]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default3]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default1]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default4]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default6]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default5]:[2022-10-06 13:47:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step500 is ready now! +[default7]: iteration 501/ 6200 | consumed samples: 513024 | consumed tokens: 1050673152 | elapsed time per iteration (s): 54.22 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.142695E+00 | loss scale: 512.0 | grad norm: 7.965 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 18.887 | TFLOPs: 5.76 | +[default7]: iteration 502/ 6200 | consumed samples: 514048 | consumed tokens: 1052770304 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.165023E+00 | loss scale: 512.0 | grad norm: 6.005 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.969 | TFLOPs: 41.75 | +[default7]: iteration 503/ 6200 | consumed samples: 515072 | consumed tokens: 1054867456 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.152631E+00 | loss scale: 512.0 | grad norm: 6.787 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.701 | TFLOPs: 41.67 | +[default7]: iteration 504/ 6200 | consumed samples: 516096 | consumed tokens: 1056964608 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.119533E+00 | loss scale: 1024.0 | grad norm: 3.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.382 | TFLOPs: 41.88 | +[default7]: iteration 505/ 6200 | consumed samples: 517120 | consumed tokens: 1059061760 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.175550E+00 | loss scale: 1024.0 | grad norm: 5.713 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.667 | TFLOPs: 41.66 | +[default7]: iteration 506/ 6200 | consumed samples: 518144 | consumed tokens: 1061158912 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.135023E+00 | loss scale: 1024.0 | grad norm: 7.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.713 | TFLOPs: 41.98 | +[default7]: iteration 507/ 6200 | consumed samples: 519168 | consumed tokens: 1063256064 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.133095E+00 | loss scale: 1024.0 | grad norm: 6.901 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.397 | TFLOPs: 41.88 | +[default7]: iteration 508/ 6200 | consumed samples: 520192 | consumed tokens: 1065353216 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.142659E+00 | loss scale: 1024.0 | grad norm: 8.028 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.469 | TFLOPs: 41.90 | +[default7]: iteration 509/ 6200 | consumed samples: 521216 | consumed tokens: 1067450368 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.146896E+00 | loss scale: 1024.0 | grad norm: 7.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.192 | TFLOPs: 41.82 | +[default7]: iteration 510/ 6200 | consumed samples: 522240 | consumed tokens: 1069547520 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.135437E+00 | loss scale: 1024.0 | grad norm: 8.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.287 | TFLOPs: 41.85 | +[default7]: iteration 511/ 6200 | consumed samples: 523264 | consumed tokens: 1071644672 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.108875E+00 | loss scale: 1024.0 | grad norm: 8.802 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.323 | TFLOPs: 41.86 | +[default7]: iteration 512/ 6200 | consumed samples: 524288 | consumed tokens: 1073741824 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.129268E+00 | loss scale: 1024.0 | grad norm: 7.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.180 | TFLOPs: 42.12 | +[default7]: iteration 513/ 6200 | consumed samples: 525312 | consumed tokens: 1075838976 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.166787E+00 | loss scale: 1024.0 | grad norm: 6.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.942 | TFLOPs: 42.05 | +[default7]: iteration 514/ 6200 | consumed samples: 526336 | consumed tokens: 1077936128 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.134938E+00 | loss scale: 1024.0 | grad norm: 6.878 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.407 | TFLOPs: 41.88 | +[default7]: iteration 515/ 6200 | consumed samples: 527360 | consumed tokens: 1080033280 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.143279E+00 | loss scale: 1024.0 | grad norm: 8.922 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.350 | TFLOPs: 41.87 | +[default7]: iteration 516/ 6200 | consumed samples: 528384 | consumed tokens: 1082130432 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.116441E+00 | loss scale: 1024.0 | grad norm: 6.937 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.598 | TFLOPs: 41.94 | +[default7]: iteration 517/ 6200 | consumed samples: 529408 | consumed tokens: 1084227584 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.130737E+00 | loss scale: 1024.0 | grad norm: 9.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.714 | TFLOPs: 41.98 | +[default7]: iteration 518/ 6200 | consumed samples: 530432 | consumed tokens: 1086324736 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.148597E+00 | loss scale: 1024.0 | grad norm: 9.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.705 | TFLOPs: 41.97 | +[default7]: iteration 519/ 6200 | consumed samples: 531456 | consumed tokens: 1088421888 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.147597E+00 | loss scale: 1024.0 | grad norm: 7.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.051 | TFLOPs: 41.77 | +[default7]: iteration 520/ 6200 | consumed samples: 532480 | consumed tokens: 1090519040 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.093552E+00 | loss scale: 1024.0 | grad norm: 7.940 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.126 | TFLOPs: 41.80 | +[default7]: iteration 521/ 6200 | consumed samples: 533504 | consumed tokens: 1092616192 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.115924E+00 | loss scale: 1024.0 | grad norm: 9.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.297 | TFLOPs: 41.85 | +[default7]: iteration 522/ 6200 | consumed samples: 534528 | consumed tokens: 1094713344 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.148550E+00 | loss scale: 1024.0 | grad norm: 6.633 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.201 | TFLOPs: 41.82 | +[default7]: iteration 523/ 6200 | consumed samples: 535552 | consumed tokens: 1096810496 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.119197E+00 | loss scale: 1024.0 | grad norm: 7.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.237 | TFLOPs: 41.83 | +[default7]: iteration 524/ 6200 | consumed samples: 536576 | consumed tokens: 1098907648 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.146945E+00 | loss scale: 1024.0 | grad norm: 7.906 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.401 | TFLOPs: 41.88 | +[default7]: iteration 525/ 6200 | consumed samples: 537600 | consumed tokens: 1101004800 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.144628E+00 | loss scale: 1024.0 | grad norm: 7.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.216 | TFLOPs: 42.13 | +[default7]: iteration 526/ 6200 | consumed samples: 538624 | consumed tokens: 1103101952 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.144933E+00 | loss scale: 1024.0 | grad norm: 10.683 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.058 | TFLOPs: 42.08 | +[default7]: iteration 527/ 6200 | consumed samples: 539648 | consumed tokens: 1105199104 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.125473E+00 | loss scale: 1024.0 | grad norm: 8.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.652 | TFLOPs: 41.96 | +[default7]: iteration 528/ 6200 | consumed samples: 540672 | consumed tokens: 1107296256 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.125587E+00 | loss scale: 1024.0 | grad norm: 6.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.762 | TFLOPs: 41.99 | +[default7]: iteration 529/ 6200 | consumed samples: 541696 | consumed tokens: 1109393408 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.120049E+00 | loss scale: 1024.0 | grad norm: 7.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.240 | TFLOPs: 41.83 | +[default7]: iteration 530/ 6200 | consumed samples: 542720 | consumed tokens: 1111490560 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.131431E+00 | loss scale: 1024.0 | grad norm: 7.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.782 | TFLOPs: 42.00 | +[default7]: iteration 531/ 6200 | consumed samples: 543744 | consumed tokens: 1113587712 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.122583E+00 | loss scale: 1024.0 | grad norm: 7.671 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.268 | TFLOPs: 41.84 | +[default7]: iteration 532/ 6200 | consumed samples: 544768 | consumed tokens: 1115684864 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.131818E+00 | loss scale: 1024.0 | grad norm: 6.870 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.244 | TFLOPs: 41.83 | +[default7]: iteration 533/ 6200 | consumed samples: 545792 | consumed tokens: 1117782016 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.141186E+00 | loss scale: 1024.0 | grad norm: 6.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.236 | TFLOPs: 41.83 | +[default7]: iteration 534/ 6200 | consumed samples: 546816 | consumed tokens: 1119879168 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.099131E+00 | loss scale: 1024.0 | grad norm: 7.655 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.018 | TFLOPs: 41.76 | +[default7]: iteration 535/ 6200 | consumed samples: 547840 | consumed tokens: 1121976320 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.142438E+00 | loss scale: 1024.0 | grad norm: 6.807 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.132 | TFLOPs: 41.80 | +[default7]: iteration 536/ 6200 | consumed samples: 548864 | consumed tokens: 1124073472 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.115973E+00 | loss scale: 1024.0 | grad norm: 6.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.941 | TFLOPs: 41.74 | +[default7]: iteration 537/ 6200 | consumed samples: 549888 | consumed tokens: 1126170624 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.151537E+00 | loss scale: 1024.0 | grad norm: 7.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.409 | TFLOPs: 41.88 | +[default7]: iteration 538/ 6200 | consumed samples: 550912 | consumed tokens: 1128267776 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.097895E+00 | loss scale: 1024.0 | grad norm: 8.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.606 | TFLOPs: 41.94 | +[default7]: iteration 539/ 6200 | consumed samples: 551936 | consumed tokens: 1130364928 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.130151E+00 | loss scale: 1024.0 | grad norm: 7.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.787 | TFLOPs: 42.00 | +[default7]: iteration 540/ 6200 | consumed samples: 552960 | consumed tokens: 1132462080 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.147462E+00 | loss scale: 1024.0 | grad norm: 6.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.298 | TFLOPs: 41.85 | +[default7]: iteration 541/ 6200 | consumed samples: 553984 | consumed tokens: 1134559232 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.115110E+00 | loss scale: 1024.0 | grad norm: 9.614 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.195 | TFLOPs: 41.82 | +[default7]: iteration 542/ 6200 | consumed samples: 555008 | consumed tokens: 1136656384 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.114839E+00 | loss scale: 1024.0 | grad norm: 7.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.103 | TFLOPs: 41.79 | +[default7]: iteration 543/ 6200 | consumed samples: 556032 | consumed tokens: 1138753536 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.147350E+00 | loss scale: 1024.0 | grad norm: 7.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.757 | TFLOPs: 41.99 | +[default7]: iteration 544/ 6200 | consumed samples: 557056 | consumed tokens: 1140850688 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.122765E+00 | loss scale: 1024.0 | grad norm: 11.020 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.191 | TFLOPs: 41.82 | +[default7]: iteration 545/ 6200 | consumed samples: 558080 | consumed tokens: 1142947840 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.149161E+00 | loss scale: 1024.0 | grad norm: 7.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.491 | TFLOPs: 41.91 | +[default7]: iteration 546/ 6200 | consumed samples: 559104 | consumed tokens: 1145044992 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.120883E+00 | loss scale: 1024.0 | grad norm: 6.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.385 | TFLOPs: 41.88 | +[default7]: iteration 547/ 6200 | consumed samples: 560128 | consumed tokens: 1147142144 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.144075E+00 | loss scale: 1024.0 | grad norm: 9.748 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.330 | TFLOPs: 41.86 | +[default7]: iteration 548/ 6200 | consumed samples: 561152 | consumed tokens: 1149239296 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.128562E+00 | loss scale: 1024.0 | grad norm: 11.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.252 | TFLOPs: 41.84 | +[default7]: iteration 549/ 6200 | consumed samples: 562176 | consumed tokens: 1151336448 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.139233E+00 | loss scale: 1024.0 | grad norm: 7.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.276 | TFLOPs: 41.84 | +[default7]: iteration 550/ 6200 | consumed samples: 563200 | consumed tokens: 1153433600 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.136922E+00 | loss scale: 1024.0 | grad norm: 6.004 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.399 | TFLOPs: 41.88 | +[default7]: iteration 551/ 6200 | consumed samples: 564224 | consumed tokens: 1155530752 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.139159E+00 | loss scale: 1024.0 | grad norm: 11.765 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.503 | TFLOPs: 41.91 | +[default7]: iteration 552/ 6200 | consumed samples: 565248 | consumed tokens: 1157627904 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.107892E+00 | loss scale: 1024.0 | grad norm: 12.724 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.123 | TFLOPs: 41.80 | +[default7]: iteration 553/ 6200 | consumed samples: 566272 | consumed tokens: 1159725056 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.153206E+00 | loss scale: 1024.0 | grad norm: 7.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.285 | TFLOPs: 41.85 | +[default7]: iteration 554/ 6200 | consumed samples: 567296 | consumed tokens: 1161822208 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.105306E+00 | loss scale: 1024.0 | grad norm: 6.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.912 | TFLOPs: 42.04 | +[default7]: iteration 555/ 6200 | consumed samples: 568320 | consumed tokens: 1163919360 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.122540E+00 | loss scale: 1024.0 | grad norm: 7.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.345 | TFLOPs: 41.86 | +[default7]: iteration 556/ 6200 | consumed samples: 569344 | consumed tokens: 1166016512 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.136348E+00 | loss scale: 1024.0 | grad norm: 8.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.541 | TFLOPs: 41.92 | +[default7]: iteration 557/ 6200 | consumed samples: 570368 | consumed tokens: 1168113664 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.120302E+00 | loss scale: 1024.0 | grad norm: 7.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.288 | TFLOPs: 41.85 | +[default7]: iteration 558/ 6200 | consumed samples: 571392 | consumed tokens: 1170210816 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.168553E+00 | loss scale: 1024.0 | grad norm: 6.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.632 | TFLOPs: 41.95 | +[default7]: iteration 559/ 6200 | consumed samples: 572416 | consumed tokens: 1172307968 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.089411E+00 | loss scale: 1024.0 | grad norm: 7.740 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.253 | TFLOPs: 41.84 | +[default7]: iteration 560/ 6200 | consumed samples: 573440 | consumed tokens: 1174405120 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.090869E+00 | loss scale: 1024.0 | grad norm: 8.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.460 | TFLOPs: 41.90 | +[default7]: iteration 561/ 6200 | consumed samples: 574464 | consumed tokens: 1176502272 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.130466E+00 | loss scale: 1024.0 | grad norm: 6.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.494 | TFLOPs: 41.91 | +[default7]: iteration 562/ 6200 | consumed samples: 575488 | consumed tokens: 1178599424 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.115896E+00 | loss scale: 1024.0 | grad norm: 7.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.302 | TFLOPs: 42.16 | +[default7]: iteration 563/ 6200 | consumed samples: 576512 | consumed tokens: 1180696576 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.110658E+00 | loss scale: 1024.0 | grad norm: 6.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.099 | TFLOPs: 42.09 | +[default7]: iteration 564/ 6200 | consumed samples: 577536 | consumed tokens: 1182793728 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.097682E+00 | loss scale: 1024.0 | grad norm: 7.034 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.852 | TFLOPs: 42.02 | +[default7]: iteration 565/ 6200 | consumed samples: 578560 | consumed tokens: 1184890880 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.091748E+00 | loss scale: 1024.0 | grad norm: 6.793 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.571 | TFLOPs: 41.93 | +[default7]: iteration 566/ 6200 | consumed samples: 579584 | consumed tokens: 1186988032 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.121800E+00 | loss scale: 1024.0 | grad norm: 7.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.723 | TFLOPs: 41.98 | +[default7]: iteration 567/ 6200 | consumed samples: 580608 | consumed tokens: 1189085184 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.089960E+00 | loss scale: 1024.0 | grad norm: 6.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.362 | TFLOPs: 41.87 | +[default7]: iteration 568/ 6200 | consumed samples: 581632 | consumed tokens: 1191182336 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.115696E+00 | loss scale: 1024.0 | grad norm: 7.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.341 | TFLOPs: 41.86 | +[default7]: iteration 569/ 6200 | consumed samples: 582656 | consumed tokens: 1193279488 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.118412E+00 | loss scale: 1024.0 | grad norm: 6.984 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.741 | TFLOPs: 41.98 | +[default7]: iteration 570/ 6200 | consumed samples: 583680 | consumed tokens: 1195376640 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.111142E+00 | loss scale: 1024.0 | grad norm: 8.722 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.467 | TFLOPs: 41.90 | +[default7]: iteration 571/ 6200 | consumed samples: 584704 | consumed tokens: 1197473792 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.138117E+00 | loss scale: 1024.0 | grad norm: 6.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.909 | TFLOPs: 42.04 | +[default7]: iteration 572/ 6200 | consumed samples: 585728 | consumed tokens: 1199570944 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.107264E+00 | loss scale: 1024.0 | grad norm: 5.668 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.348 | TFLOPs: 42.17 | +[default7]: iteration 573/ 6200 | consumed samples: 586752 | consumed tokens: 1201668096 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.121762E+00 | loss scale: 1024.0 | grad norm: 6.886 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.488 | TFLOPs: 42.21 | +[default7]: iteration 574/ 6200 | consumed samples: 587776 | consumed tokens: 1203765248 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.116478E+00 | loss scale: 1024.0 | grad norm: 6.924 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.260 | TFLOPs: 42.14 | +[default7]: iteration 575/ 6200 | consumed samples: 588800 | consumed tokens: 1205862400 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.133538E+00 | loss scale: 1024.0 | grad norm: 8.028 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.880 | TFLOPs: 42.03 | +[default7]: iteration 576/ 6200 | consumed samples: 589824 | consumed tokens: 1207959552 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.109363E+00 | loss scale: 1024.0 | grad norm: 6.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.198 | TFLOPs: 42.12 | +[default7]: iteration 577/ 6200 | consumed samples: 590848 | consumed tokens: 1210056704 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.106413E+00 | loss scale: 1024.0 | grad norm: 6.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.747 | TFLOPs: 41.99 | +[default7]: iteration 578/ 6200 | consumed samples: 591872 | consumed tokens: 1212153856 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.111518E+00 | loss scale: 1024.0 | grad norm: 5.670 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.231 | TFLOPs: 42.13 | +[default7]: iteration 579/ 6200 | consumed samples: 592896 | consumed tokens: 1214251008 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.098159E+00 | loss scale: 1024.0 | grad norm: 6.877 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.781 | TFLOPs: 42.00 | +[default7]: iteration 580/ 6200 | consumed samples: 593920 | consumed tokens: 1216348160 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.120652E+00 | loss scale: 1024.0 | grad norm: 6.039 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.219 | TFLOPs: 42.13 | +[default7]: iteration 581/ 6200 | consumed samples: 594944 | consumed tokens: 1218445312 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.104921E+00 | loss scale: 1024.0 | grad norm: 6.944 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.247 | TFLOPs: 42.14 | +[default7]: iteration 582/ 6200 | consumed samples: 595968 | consumed tokens: 1220542464 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.105673E+00 | loss scale: 1024.0 | grad norm: 6.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.216 | TFLOPs: 42.13 | +[default7]: iteration 583/ 6200 | consumed samples: 596992 | consumed tokens: 1222639616 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.108050E+00 | loss scale: 1024.0 | grad norm: 6.675 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.118 | TFLOPs: 42.10 | +[default7]: iteration 584/ 6200 | consumed samples: 598016 | consumed tokens: 1224736768 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.120055E+00 | loss scale: 1024.0 | grad norm: 7.618 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.462 | TFLOPs: 42.20 | +[default7]: iteration 585/ 6200 | consumed samples: 599040 | consumed tokens: 1226833920 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.094174E+00 | loss scale: 1024.0 | grad norm: 6.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.443 | TFLOPs: 42.20 | +[default7]: iteration 586/ 6200 | consumed samples: 600064 | consumed tokens: 1228931072 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.097566E+00 | loss scale: 1024.0 | grad norm: 6.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.095 | TFLOPs: 42.09 | +[default7]: iteration 587/ 6200 | consumed samples: 601088 | consumed tokens: 1231028224 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.101797E+00 | loss scale: 1024.0 | grad norm: 6.765 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.034 | TFLOPs: 41.77 | +[default7]: iteration 588/ 6200 | consumed samples: 602112 | consumed tokens: 1233125376 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.142553E+00 | loss scale: 1024.0 | grad norm: 5.967 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.477 | TFLOPs: 41.90 | +[default7]: iteration 589/ 6200 | consumed samples: 603136 | consumed tokens: 1235222528 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.130052E+00 | loss scale: 1024.0 | grad norm: 5.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.868 | TFLOPs: 42.02 | +[default7]: iteration 590/ 6200 | consumed samples: 604160 | consumed tokens: 1237319680 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.096811E+00 | loss scale: 1024.0 | grad norm: 6.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.264 | TFLOPs: 42.14 | +[default7]: iteration 591/ 6200 | consumed samples: 605184 | consumed tokens: 1239416832 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.099070E+00 | loss scale: 1024.0 | grad norm: 6.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.687 | TFLOPs: 41.97 | +[default7]: iteration 592/ 6200 | consumed samples: 606208 | consumed tokens: 1241513984 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.120278E+00 | loss scale: 1024.0 | grad norm: 8.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.176 | TFLOPs: 42.12 | +[default7]: iteration 593/ 6200 | consumed samples: 607232 | consumed tokens: 1243611136 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.087181E+00 | loss scale: 1024.0 | grad norm: 6.975 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.066 | TFLOPs: 42.08 | +[default7]: iteration 594/ 6200 | consumed samples: 608256 | consumed tokens: 1245708288 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.110761E+00 | loss scale: 1024.0 | grad norm: 6.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.025 | TFLOPs: 42.07 | +[default7]: iteration 595/ 6200 | consumed samples: 609280 | consumed tokens: 1247805440 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.099960E+00 | loss scale: 1024.0 | grad norm: 6.069 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.530 | TFLOPs: 41.92 | +[default7]: iteration 596/ 6200 | consumed samples: 610304 | consumed tokens: 1249902592 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.087389E+00 | loss scale: 1024.0 | grad norm: 8.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.461 | TFLOPs: 41.90 | +[default7]: iteration 597/ 6200 | consumed samples: 611328 | consumed tokens: 1251999744 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.096539E+00 | loss scale: 1024.0 | grad norm: 5.720 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.889 | TFLOPs: 42.03 | +[default7]: iteration 598/ 6200 | consumed samples: 612352 | consumed tokens: 1254096896 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.124824E+00 | loss scale: 1024.0 | grad norm: 5.983 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.900 | TFLOPs: 42.03 | +[default7]: iteration 599/ 6200 | consumed samples: 613376 | consumed tokens: 1256194048 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.100561E+00 | loss scale: 1024.0 | grad norm: 6.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.061 | TFLOPs: 42.08 | +[default7]: iteration 600/ 6200 | consumed samples: 614400 | consumed tokens: 1258291200 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.132365E+00 | loss scale: 1024.0 | grad norm: 6.629 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.331 | TFLOPs: 41.86 | +[default7]: iteration 601/ 6200 | consumed samples: 615424 | consumed tokens: 1260388352 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.112117E+00 | loss scale: 1024.0 | grad norm: 10.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.452 | TFLOPs: 41.90 | +[default7]: iteration 602/ 6200 | consumed samples: 616448 | consumed tokens: 1262485504 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.115198E+00 | loss scale: 1024.0 | grad norm: 9.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.147 | TFLOPs: 41.80 | +[default7]: iteration 603/ 6200 | consumed samples: 617472 | consumed tokens: 1264582656 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.109359E+00 | loss scale: 1024.0 | grad norm: 6.713 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.341 | TFLOPs: 41.86 | +[default7]: iteration 604/ 6200 | consumed samples: 618496 | consumed tokens: 1266679808 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.115981E+00 | loss scale: 1024.0 | grad norm: 10.951 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.255 | TFLOPs: 41.84 | +[default7]: iteration 605/ 6200 | consumed samples: 619520 | consumed tokens: 1268776960 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.106130E+00 | loss scale: 1024.0 | grad norm: 14.776 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.978 | TFLOPs: 42.06 | +[default7]: iteration 606/ 6200 | consumed samples: 620544 | consumed tokens: 1270874112 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.114234E+00 | loss scale: 1024.0 | grad norm: 8.804 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.411 | TFLOPs: 41.88 | +[default7]: iteration 607/ 6200 | consumed samples: 621568 | consumed tokens: 1272971264 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.141692E+00 | loss scale: 1024.0 | grad norm: 7.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.018 | TFLOPs: 41.76 | +[default7]: iteration 608/ 6200 | consumed samples: 622592 | consumed tokens: 1275068416 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.111139E+00 | loss scale: 1024.0 | grad norm: 11.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.668 | TFLOPs: 41.66 | +[default7]: iteration 609/ 6200 | consumed samples: 623616 | consumed tokens: 1277165568 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.138804E+00 | loss scale: 1024.0 | grad norm: 9.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.861 | TFLOPs: 41.72 | +[default7]: iteration 610/ 6200 | consumed samples: 624640 | consumed tokens: 1279262720 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.118035E+00 | loss scale: 1024.0 | grad norm: 7.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.463 | TFLOPs: 41.90 | +[default7]: iteration 611/ 6200 | consumed samples: 625664 | consumed tokens: 1281359872 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.099530E+00 | loss scale: 1024.0 | grad norm: 9.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.493 | TFLOPs: 41.91 | +[default7]: iteration 612/ 6200 | consumed samples: 626688 | consumed tokens: 1283457024 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.080122E+00 | loss scale: 1024.0 | grad norm: 9.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.713 | TFLOPs: 41.98 | +[default7]: iteration 613/ 6200 | consumed samples: 627712 | consumed tokens: 1285554176 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.107356E+00 | loss scale: 1024.0 | grad norm: 6.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.659 | TFLOPs: 41.96 | +[default7]: iteration 614/ 6200 | consumed samples: 628736 | consumed tokens: 1287651328 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.108040E+00 | loss scale: 1024.0 | grad norm: 6.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.258 | TFLOPs: 41.84 | +[default7]: iteration 615/ 6200 | consumed samples: 629760 | consumed tokens: 1289748480 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.108127E+00 | loss scale: 1024.0 | grad norm: 7.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.923 | TFLOPs: 41.74 | +[default7]: iteration 616/ 6200 | consumed samples: 630784 | consumed tokens: 1291845632 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.092990E+00 | loss scale: 1024.0 | grad norm: 5.771 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.234 | TFLOPs: 41.83 | +[default7]: iteration 617/ 6200 | consumed samples: 631808 | consumed tokens: 1293942784 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.085627E+00 | loss scale: 1024.0 | grad norm: 5.902 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.785 | TFLOPs: 42.00 | +[default7]: iteration 618/ 6200 | consumed samples: 632832 | consumed tokens: 1296039936 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.100055E+00 | loss scale: 1024.0 | grad norm: 6.681 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.749 | TFLOPs: 41.99 | +[default7]: iteration 619/ 6200 | consumed samples: 633856 | consumed tokens: 1298137088 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.082906E+00 | loss scale: 1024.0 | grad norm: 5.958 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.237 | TFLOPs: 41.83 | +[default7]: iteration 620/ 6200 | consumed samples: 634880 | consumed tokens: 1300234240 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.103170E+00 | loss scale: 1024.0 | grad norm: 6.065 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.328 | TFLOPs: 41.86 | +[default7]: iteration 621/ 6200 | consumed samples: 635904 | consumed tokens: 1302331392 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.085985E+00 | loss scale: 1024.0 | grad norm: 7.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.351 | TFLOPs: 41.87 | +[default7]: iteration 622/ 6200 | consumed samples: 636928 | consumed tokens: 1304428544 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.089495E+00 | loss scale: 1024.0 | grad norm: 7.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.339 | TFLOPs: 41.86 | +[default7]: iteration 623/ 6200 | consumed samples: 637952 | consumed tokens: 1306525696 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.073705E+00 | loss scale: 1024.0 | grad norm: 6.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.280 | TFLOPs: 41.84 | +[default7]: iteration 624/ 6200 | consumed samples: 638976 | consumed tokens: 1308622848 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.102024E+00 | loss scale: 1024.0 | grad norm: 8.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.285 | TFLOPs: 41.85 | +[default7]: iteration 625/ 6200 | consumed samples: 640000 | consumed tokens: 1310720000 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.101371E+00 | loss scale: 1024.0 | grad norm: 8.832 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.660 | TFLOPs: 41.96 | +[default7]:---------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 625 | lm loss value: 3.384842E+00 | lm loss PPL: 2.951333E+01 | +[default7]:---------------------------------------------------------------------------------------------------------- +[default7]:---------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 625 | lm loss value: 2.006690E+00 | lm loss PPL: 7.438655E+00 | +[default7]:---------------------------------------------------------------------------------------------- +[default7]: iteration 626/ 6200 | consumed samples: 641024 | consumed tokens: 1312817152 | elapsed time per iteration (s): 53.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.100748E+00 | loss scale: 1024.0 | grad norm: 7.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.197 | TFLOPs: 5.85 | +[default7]: iteration 627/ 6200 | consumed samples: 642048 | consumed tokens: 1314914304 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.121236E+00 | loss scale: 1024.0 | grad norm: 5.861 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.506 | TFLOPs: 41.61 | +[default7]: iteration 628/ 6200 | consumed samples: 643072 | consumed tokens: 1317011456 | elapsed time per iteration (s): 7.51 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.089739E+00 | loss scale: 1024.0 | grad norm: 8.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.320 | TFLOPs: 41.55 | +[default7]: iteration 629/ 6200 | consumed samples: 644096 | consumed tokens: 1319108608 | elapsed time per iteration (s): 7.52 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.083596E+00 | loss scale: 1024.0 | grad norm: 6.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.252 | TFLOPs: 41.53 | +[default7]: iteration 630/ 6200 | consumed samples: 645120 | consumed tokens: 1321205760 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.116551E+00 | loss scale: 1024.0 | grad norm: 5.882 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.596 | TFLOPs: 41.64 | +[default7]: iteration 631/ 6200 | consumed samples: 646144 | consumed tokens: 1323302912 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.066382E+00 | loss scale: 1024.0 | grad norm: 6.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.425 | TFLOPs: 41.89 | +[default7]: iteration 632/ 6200 | consumed samples: 647168 | consumed tokens: 1325400064 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.103524E+00 | loss scale: 1024.0 | grad norm: 6.602 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.775 | TFLOPs: 41.99 | +[default7]: iteration 633/ 6200 | consumed samples: 648192 | consumed tokens: 1327497216 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.112973E+00 | loss scale: 1024.0 | grad norm: 5.940 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.626 | TFLOPs: 41.95 | +[default7]: iteration 634/ 6200 | consumed samples: 649216 | consumed tokens: 1329594368 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.087316E+00 | loss scale: 1024.0 | grad norm: 6.042 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.006 | TFLOPs: 42.07 | +[default7]: iteration 635/ 6200 | consumed samples: 650240 | consumed tokens: 1331691520 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.081865E+00 | loss scale: 1024.0 | grad norm: 7.781 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.090 | TFLOPs: 42.09 | +[default7]: iteration 636/ 6200 | consumed samples: 651264 | consumed tokens: 1333788672 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.069978E+00 | loss scale: 1024.0 | grad norm: 6.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.293 | TFLOPs: 42.15 | +[default7]: iteration 637/ 6200 | consumed samples: 652288 | consumed tokens: 1335885824 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.065311E+00 | loss scale: 1024.0 | grad norm: 6.079 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.467 | TFLOPs: 41.90 | +[default7]: iteration 638/ 6200 | consumed samples: 653312 | consumed tokens: 1337982976 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.118611E+00 | loss scale: 1024.0 | grad norm: 6.942 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.016 | TFLOPs: 42.07 | +[default7]: iteration 639/ 6200 | consumed samples: 654336 | consumed tokens: 1340080128 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.104863E+00 | loss scale: 1024.0 | grad norm: 5.528 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.818 | TFLOPs: 42.01 | +[default7]: iteration 640/ 6200 | consumed samples: 655360 | consumed tokens: 1342177280 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.066296E+00 | loss scale: 1024.0 | grad norm: 6.631 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.234 | TFLOPs: 42.13 | +[default7]: iteration 641/ 6200 | consumed samples: 656384 | consumed tokens: 1344274432 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.095726E+00 | loss scale: 1024.0 | grad norm: 6.874 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.899 | TFLOPs: 42.03 | +[default7]: iteration 642/ 6200 | consumed samples: 657408 | consumed tokens: 1346371584 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.104774E+00 | loss scale: 1024.0 | grad norm: 8.779 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.968 | TFLOPs: 42.05 | +[default7]: iteration 643/ 6200 | consumed samples: 658432 | consumed tokens: 1348468736 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.069025E+00 | loss scale: 1024.0 | grad norm: 8.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.795 | TFLOPs: 42.00 | +[default7]: iteration 644/ 6200 | consumed samples: 659456 | consumed tokens: 1350565888 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.057475E+00 | loss scale: 1024.0 | grad norm: 5.983 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.569 | TFLOPs: 41.93 | +[default7]: iteration 645/ 6200 | consumed samples: 660480 | consumed tokens: 1352663040 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.081385E+00 | loss scale: 1024.0 | grad norm: 11.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.523 | TFLOPs: 41.92 | +[default7]: iteration 646/ 6200 | consumed samples: 661504 | consumed tokens: 1354760192 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.091856E+00 | loss scale: 1024.0 | grad norm: 8.879 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.875 | TFLOPs: 42.03 | +[default7]: iteration 647/ 6200 | consumed samples: 662528 | consumed tokens: 1356857344 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.101812E+00 | loss scale: 1024.0 | grad norm: 5.960 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.725 | TFLOPs: 41.98 | +[default7]: iteration 648/ 6200 | consumed samples: 663552 | consumed tokens: 1358954496 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.086768E+00 | loss scale: 1024.0 | grad norm: 7.805 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.884 | TFLOPs: 42.03 | +[default7]: iteration 649/ 6200 | consumed samples: 664576 | consumed tokens: 1361051648 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.088110E+00 | loss scale: 1024.0 | grad norm: 9.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.244 | TFLOPs: 42.14 | +[default7]: iteration 650/ 6200 | consumed samples: 665600 | consumed tokens: 1363148800 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.085779E+00 | loss scale: 1024.0 | grad norm: 5.830 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.381 | TFLOPs: 41.87 | +[default7]: iteration 651/ 6200 | consumed samples: 666624 | consumed tokens: 1365245952 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.094911E+00 | loss scale: 1024.0 | grad norm: 5.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.978 | TFLOPs: 42.36 | +[default7]: iteration 652/ 6200 | consumed samples: 667648 | consumed tokens: 1367343104 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.105126E+00 | loss scale: 1024.0 | grad norm: 10.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.158 | TFLOPs: 41.81 | +[default7]: iteration 653/ 6200 | consumed samples: 668672 | consumed tokens: 1369440256 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.102912E+00 | loss scale: 1024.0 | grad norm: 8.919 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.605 | TFLOPs: 42.25 | +[default7]: iteration 654/ 6200 | consumed samples: 669696 | consumed tokens: 1371537408 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.106535E+00 | loss scale: 1024.0 | grad norm: 6.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.100 | TFLOPs: 42.09 | +[default7]: iteration 655/ 6200 | consumed samples: 670720 | consumed tokens: 1373634560 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.092876E+00 | loss scale: 1024.0 | grad norm: 9.904 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.631 | TFLOPs: 41.95 | +[default7]: iteration 656/ 6200 | consumed samples: 671744 | consumed tokens: 1375731712 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.085565E+00 | loss scale: 1024.0 | grad norm: 12.769 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.819 | TFLOPs: 42.01 | +[default7]: iteration 657/ 6200 | consumed samples: 672768 | consumed tokens: 1377828864 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.072741E+00 | loss scale: 1024.0 | grad norm: 8.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.231 | TFLOPs: 42.13 | +[default7]: iteration 658/ 6200 | consumed samples: 673792 | consumed tokens: 1379926016 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.081690E+00 | loss scale: 1024.0 | grad norm: 6.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.433 | TFLOPs: 42.20 | +[default7]: iteration 659/ 6200 | consumed samples: 674816 | consumed tokens: 1382023168 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.098101E+00 | loss scale: 1024.0 | grad norm: 8.716 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.116 | TFLOPs: 42.10 | +[default7]: iteration 660/ 6200 | consumed samples: 675840 | consumed tokens: 1384120320 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.074945E+00 | loss scale: 1024.0 | grad norm: 7.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.011 | TFLOPs: 42.07 | +[default7]: iteration 661/ 6200 | consumed samples: 676864 | consumed tokens: 1386217472 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.086445E+00 | loss scale: 1024.0 | grad norm: 5.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.122 | TFLOPs: 42.10 | +[default7]: iteration 662/ 6200 | consumed samples: 677888 | consumed tokens: 1388314624 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.087425E+00 | loss scale: 1024.0 | grad norm: 6.927 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.579 | TFLOPs: 41.94 | +[default7]: iteration 663/ 6200 | consumed samples: 678912 | consumed tokens: 1390411776 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.042099E+00 | loss scale: 1024.0 | grad norm: 6.706 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.653 | TFLOPs: 42.26 | +[default7]: iteration 664/ 6200 | consumed samples: 679936 | consumed tokens: 1392508928 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.084859E+00 | loss scale: 1024.0 | grad norm: 6.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.729 | TFLOPs: 42.29 | +[default7]: iteration 665/ 6200 | consumed samples: 680960 | consumed tokens: 1394606080 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.063266E+00 | loss scale: 1024.0 | grad norm: 7.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.273 | TFLOPs: 41.84 | +[default7]: iteration 666/ 6200 | consumed samples: 681984 | consumed tokens: 1396703232 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.071255E+00 | loss scale: 1024.0 | grad norm: 6.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.140 | TFLOPs: 42.11 | +[default7]: iteration 667/ 6200 | consumed samples: 683008 | consumed tokens: 1398800384 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.094700E+00 | loss scale: 1024.0 | grad norm: 7.001 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.416 | TFLOPs: 42.19 | +[default7]: iteration 668/ 6200 | consumed samples: 684032 | consumed tokens: 1400897536 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.087430E+00 | loss scale: 1024.0 | grad norm: 5.732 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.274 | TFLOPs: 42.15 | +[default7]: iteration 669/ 6200 | consumed samples: 685056 | consumed tokens: 1402994688 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.077841E+00 | loss scale: 1024.0 | grad norm: 5.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.310 | TFLOPs: 42.16 | +[default7]: iteration 670/ 6200 | consumed samples: 686080 | consumed tokens: 1405091840 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.077294E+00 | loss scale: 1024.0 | grad norm: 7.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.039 | TFLOPs: 42.08 | +[default7]: iteration 671/ 6200 | consumed samples: 687104 | consumed tokens: 1407188992 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.083833E+00 | loss scale: 1024.0 | grad norm: 6.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.489 | TFLOPs: 42.21 | +[default7]: iteration 672/ 6200 | consumed samples: 688128 | consumed tokens: 1409286144 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.089034E+00 | loss scale: 1024.0 | grad norm: 6.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.936 | TFLOPs: 42.04 | +[default7]: iteration 673/ 6200 | consumed samples: 689152 | consumed tokens: 1411383296 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.092388E+00 | loss scale: 1024.0 | grad norm: 5.930 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.469 | TFLOPs: 42.21 | +[default7]: iteration 674/ 6200 | consumed samples: 690176 | consumed tokens: 1413480448 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.049237E+00 | loss scale: 1024.0 | grad norm: 7.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.305 | TFLOPs: 42.16 | +[default7]: iteration 675/ 6200 | consumed samples: 691200 | consumed tokens: 1415577600 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.095962E+00 | loss scale: 1024.0 | grad norm: 6.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.489 | TFLOPs: 42.21 | +[default7]: iteration 676/ 6200 | consumed samples: 692224 | consumed tokens: 1417674752 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.097162E+00 | loss scale: 1024.0 | grad norm: 6.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.074 | TFLOPs: 42.09 | +[default7]: iteration 677/ 6200 | consumed samples: 693248 | consumed tokens: 1419771904 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.085276E+00 | loss scale: 1024.0 | grad norm: 7.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.300 | TFLOPs: 42.16 | +[default7]: iteration 678/ 6200 | consumed samples: 694272 | consumed tokens: 1421869056 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.085377E+00 | loss scale: 1024.0 | grad norm: 7.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.431 | TFLOPs: 42.19 | +[default7]: iteration 679/ 6200 | consumed samples: 695296 | consumed tokens: 1423966208 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.091989E+00 | loss scale: 1024.0 | grad norm: 6.657 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.642 | TFLOPs: 41.95 | +[default7]: iteration 680/ 6200 | consumed samples: 696320 | consumed tokens: 1426063360 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.084699E+00 | loss scale: 1024.0 | grad norm: 7.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.170 | TFLOPs: 42.12 | +[default7]: iteration 681/ 6200 | consumed samples: 697344 | consumed tokens: 1428160512 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.108159E+00 | loss scale: 1024.0 | grad norm: 8.981 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.108 | TFLOPs: 42.10 | +[default7]: iteration 682/ 6200 | consumed samples: 698368 | consumed tokens: 1430257664 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.080415E+00 | loss scale: 1024.0 | grad norm: 6.732 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.135 | TFLOPs: 42.10 | +[default7]: iteration 683/ 6200 | consumed samples: 699392 | consumed tokens: 1432354816 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.074321E+00 | loss scale: 1024.0 | grad norm: 5.890 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.813 | TFLOPs: 42.01 | +[default7]: iteration 684/ 6200 | consumed samples: 700416 | consumed tokens: 1434451968 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.052513E+00 | loss scale: 1024.0 | grad norm: 5.898 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.071 | TFLOPs: 42.09 | +[default7]: iteration 685/ 6200 | consumed samples: 701440 | consumed tokens: 1436549120 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.060472E+00 | loss scale: 1024.0 | grad norm: 7.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.820 | TFLOPs: 42.01 | +[default7]: iteration 686/ 6200 | consumed samples: 702464 | consumed tokens: 1438646272 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.069421E+00 | loss scale: 1024.0 | grad norm: 6.750 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.593 | TFLOPs: 41.94 | +[default7]: iteration 687/ 6200 | consumed samples: 703488 | consumed tokens: 1440743424 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.084786E+00 | loss scale: 1024.0 | grad norm: 6.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.546 | TFLOPs: 41.93 | +[default7]: iteration 688/ 6200 | consumed samples: 704512 | consumed tokens: 1442840576 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.067118E+00 | loss scale: 1024.0 | grad norm: 6.805 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.676 | TFLOPs: 41.96 | +[default7]: iteration 689/ 6200 | consumed samples: 705536 | consumed tokens: 1444937728 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.096025E+00 | loss scale: 1024.0 | grad norm: 5.646 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.531 | TFLOPs: 41.92 | +[default7]: iteration 690/ 6200 | consumed samples: 706560 | consumed tokens: 1447034880 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.074728E+00 | loss scale: 1024.0 | grad norm: 6.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.919 | TFLOPs: 42.04 | +[default7]: iteration 691/ 6200 | consumed samples: 707584 | consumed tokens: 1449132032 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.082446E+00 | loss scale: 1024.0 | grad norm: 6.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.662 | TFLOPs: 41.66 | +[default7]: iteration 692/ 6200 | consumed samples: 708608 | consumed tokens: 1451229184 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.083560E+00 | loss scale: 1024.0 | grad norm: 5.758 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.548 | TFLOPs: 41.93 | +[default7]: iteration 693/ 6200 | consumed samples: 709632 | consumed tokens: 1453326336 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.074956E+00 | loss scale: 1024.0 | grad norm: 6.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.324 | TFLOPs: 41.86 | +[default7]: iteration 694/ 6200 | consumed samples: 710656 | consumed tokens: 1455423488 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.079283E+00 | loss scale: 1024.0 | grad norm: 6.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.124 | TFLOPs: 42.10 | +[default7]: iteration 695/ 6200 | consumed samples: 711680 | consumed tokens: 1457520640 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.074467E+00 | loss scale: 1024.0 | grad norm: 5.624 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.853 | TFLOPs: 42.02 | +[default7]: iteration 696/ 6200 | consumed samples: 712704 | consumed tokens: 1459617792 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.070500E+00 | loss scale: 1024.0 | grad norm: 5.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.685 | TFLOPs: 41.97 | +[default7]: iteration 697/ 6200 | consumed samples: 713728 | consumed tokens: 1461714944 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.072422E+00 | loss scale: 1024.0 | grad norm: 6.058 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.152 | TFLOPs: 42.11 | +[default7]: iteration 698/ 6200 | consumed samples: 714752 | consumed tokens: 1463812096 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.099458E+00 | loss scale: 1024.0 | grad norm: 5.969 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.529 | TFLOPs: 41.92 | +[default7]: iteration 699/ 6200 | consumed samples: 715776 | consumed tokens: 1465909248 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.071277E+00 | loss scale: 1024.0 | grad norm: 6.992 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.692 | TFLOPs: 41.97 | +[default7]: iteration 700/ 6200 | consumed samples: 716800 | consumed tokens: 1468006400 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.060969E+00 | loss scale: 1024.0 | grad norm: 7.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.881 | TFLOPs: 42.03 | +[default7]: iteration 701/ 6200 | consumed samples: 717824 | consumed tokens: 1470103552 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.069765E+00 | loss scale: 1024.0 | grad norm: 5.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.836 | TFLOPs: 42.01 | +[default7]: iteration 702/ 6200 | consumed samples: 718848 | consumed tokens: 1472200704 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.065793E+00 | loss scale: 1024.0 | grad norm: 5.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.775 | TFLOPs: 42.00 | +[default7]: iteration 703/ 6200 | consumed samples: 719872 | consumed tokens: 1474297856 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.058912E+00 | loss scale: 1024.0 | grad norm: 5.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.823 | TFLOPs: 42.01 | +[default7]: iteration 704/ 6200 | consumed samples: 720896 | consumed tokens: 1476395008 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.050357E+00 | loss scale: 1024.0 | grad norm: 6.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.432 | TFLOPs: 41.89 | +[default7]: iteration 705/ 6200 | consumed samples: 721920 | consumed tokens: 1478492160 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.072529E+00 | loss scale: 1024.0 | grad norm: 5.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.486 | TFLOPs: 41.91 | +[default7]: iteration 706/ 6200 | consumed samples: 722944 | consumed tokens: 1480589312 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.057459E+00 | loss scale: 1024.0 | grad norm: 5.741 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.214 | TFLOPs: 42.13 | +[default7]: iteration 707/ 6200 | consumed samples: 723968 | consumed tokens: 1482686464 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.076292E+00 | loss scale: 1024.0 | grad norm: 6.784 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.849 | TFLOPs: 42.02 | +[default7]: iteration 708/ 6200 | consumed samples: 724992 | consumed tokens: 1484783616 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.072187E+00 | loss scale: 1024.0 | grad norm: 6.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.024 | TFLOPs: 42.07 | +[default7]: iteration 709/ 6200 | consumed samples: 726016 | consumed tokens: 1486880768 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.066430E+00 | loss scale: 1024.0 | grad norm: 6.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.144 | TFLOPs: 42.11 | +[default7]: iteration 710/ 6200 | consumed samples: 727040 | consumed tokens: 1488977920 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.066794E+00 | loss scale: 1024.0 | grad norm: 6.689 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.665 | TFLOPs: 41.96 | +[default7]: iteration 711/ 6200 | consumed samples: 728064 | consumed tokens: 1491075072 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.080864E+00 | loss scale: 1024.0 | grad norm: 6.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.263 | TFLOPs: 42.14 | +[default7]: iteration 712/ 6200 | consumed samples: 729088 | consumed tokens: 1493172224 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.062725E+00 | loss scale: 1024.0 | grad norm: 5.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.347 | TFLOPs: 42.17 | +[default7]: iteration 713/ 6200 | consumed samples: 730112 | consumed tokens: 1495269376 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.083112E+00 | loss scale: 1024.0 | grad norm: 5.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.248 | TFLOPs: 42.14 | +[default7]: iteration 714/ 6200 | consumed samples: 731136 | consumed tokens: 1497366528 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.059713E+00 | loss scale: 1024.0 | grad norm: 6.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.529 | TFLOPs: 42.22 | +[default7]: iteration 715/ 6200 | consumed samples: 732160 | consumed tokens: 1499463680 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.064291E+00 | loss scale: 1024.0 | grad norm: 6.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.258 | TFLOPs: 42.14 | +[default7]: iteration 716/ 6200 | consumed samples: 733184 | consumed tokens: 1501560832 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.112985E+00 | loss scale: 1024.0 | grad norm: 5.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.142 | TFLOPs: 42.11 | +[default7]: iteration 717/ 6200 | consumed samples: 734208 | consumed tokens: 1503657984 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.063741E+00 | loss scale: 1024.0 | grad norm: 5.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.216 | TFLOPs: 42.13 | +[default7]: iteration 718/ 6200 | consumed samples: 735232 | consumed tokens: 1505755136 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.086721E+00 | loss scale: 1024.0 | grad norm: 7.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.111 | TFLOPs: 42.10 | +[default7]: iteration 719/ 6200 | consumed samples: 736256 | consumed tokens: 1507852288 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.077061E+00 | loss scale: 1024.0 | grad norm: 5.741 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.887 | TFLOPs: 42.03 | +[default7]: iteration 720/ 6200 | consumed samples: 737280 | consumed tokens: 1509949440 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.057336E+00 | loss scale: 1024.0 | grad norm: 6.920 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.996 | TFLOPs: 42.06 | +[default7]: iteration 721/ 6200 | consumed samples: 738304 | consumed tokens: 1512046592 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.086183E+00 | loss scale: 1024.0 | grad norm: 5.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.603 | TFLOPs: 41.94 | +[default7]: iteration 722/ 6200 | consumed samples: 739328 | consumed tokens: 1514143744 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.074320E+00 | loss scale: 1024.0 | grad norm: 6.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.857 | TFLOPs: 42.02 | +[default7]: iteration 723/ 6200 | consumed samples: 740352 | consumed tokens: 1516240896 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.066504E+00 | loss scale: 1024.0 | grad norm: 7.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.131 | TFLOPs: 42.10 | +[default7]: iteration 724/ 6200 | consumed samples: 741376 | consumed tokens: 1518338048 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.038088E+00 | loss scale: 1024.0 | grad norm: 5.894 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.497 | TFLOPs: 41.91 | +[default7]: iteration 725/ 6200 | consumed samples: 742400 | consumed tokens: 1520435200 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.075488E+00 | loss scale: 1024.0 | grad norm: 5.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.073 | TFLOPs: 42.09 | +[default7]: iteration 726/ 6200 | consumed samples: 743424 | consumed tokens: 1522532352 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.081691E+00 | loss scale: 1024.0 | grad norm: 5.722 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.164 | TFLOPs: 42.11 | +[default7]: iteration 727/ 6200 | consumed samples: 744448 | consumed tokens: 1524629504 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.074664E+00 | loss scale: 1024.0 | grad norm: 7.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.614 | TFLOPs: 42.25 | +[default7]: iteration 728/ 6200 | consumed samples: 745472 | consumed tokens: 1526726656 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.041119E+00 | loss scale: 1024.0 | grad norm: 6.077 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.578 | TFLOPs: 42.24 | +[default7]: iteration 729/ 6200 | consumed samples: 746496 | consumed tokens: 1528823808 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.055292E+00 | loss scale: 1024.0 | grad norm: 6.706 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.337 | TFLOPs: 42.17 | +[default7]: iteration 730/ 6200 | consumed samples: 747520 | consumed tokens: 1530920960 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.061126E+00 | loss scale: 1024.0 | grad norm: 8.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.322 | TFLOPs: 42.16 | +[default7]: iteration 731/ 6200 | consumed samples: 748544 | consumed tokens: 1533018112 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.039362E+00 | loss scale: 1024.0 | grad norm: 6.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.262 | TFLOPs: 42.14 | +[default7]: iteration 732/ 6200 | consumed samples: 749568 | consumed tokens: 1535115264 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.093653E+00 | loss scale: 1024.0 | grad norm: 6.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.238 | TFLOPs: 42.14 | +[default7]: iteration 733/ 6200 | consumed samples: 750592 | consumed tokens: 1537212416 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.066087E+00 | loss scale: 1024.0 | grad norm: 5.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.311 | TFLOPs: 42.16 | +[default7]: iteration 734/ 6200 | consumed samples: 751616 | consumed tokens: 1539309568 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.057455E+00 | loss scale: 1024.0 | grad norm: 6.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.332 | TFLOPs: 42.16 | +[default7]: iteration 735/ 6200 | consumed samples: 752640 | consumed tokens: 1541406720 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.050599E+00 | loss scale: 1024.0 | grad norm: 5.838 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.768 | TFLOPs: 41.99 | +[default7]: iteration 736/ 6200 | consumed samples: 753664 | consumed tokens: 1543503872 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.060544E+00 | loss scale: 1024.0 | grad norm: 6.838 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.104 | TFLOPs: 42.10 | +[default7]: iteration 737/ 6200 | consumed samples: 754688 | consumed tokens: 1545601024 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.078242E+00 | loss scale: 1024.0 | grad norm: 5.670 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.029 | TFLOPs: 42.07 | +[default7]: iteration 738/ 6200 | consumed samples: 755712 | consumed tokens: 1547698176 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.080486E+00 | loss scale: 1024.0 | grad norm: 7.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.932 | TFLOPs: 42.04 | +[default7]: iteration 739/ 6200 | consumed samples: 756736 | consumed tokens: 1549795328 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.042374E+00 | loss scale: 1024.0 | grad norm: 6.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.333 | TFLOPs: 41.86 | +[default7]: iteration 740/ 6200 | consumed samples: 757760 | consumed tokens: 1551892480 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.062490E+00 | loss scale: 1024.0 | grad norm: 6.772 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.375 | TFLOPs: 42.18 | +[default7]: iteration 741/ 6200 | consumed samples: 758784 | consumed tokens: 1553989632 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.091345E+00 | loss scale: 1024.0 | grad norm: 6.008 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.637 | TFLOPs: 42.26 | +[default7]: iteration 742/ 6200 | consumed samples: 759808 | consumed tokens: 1556086784 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.057942E+00 | loss scale: 1024.0 | grad norm: 6.602 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.155 | TFLOPs: 42.11 | +[default7]: iteration 743/ 6200 | consumed samples: 760832 | consumed tokens: 1558183936 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.066285E+00 | loss scale: 1024.0 | grad norm: 7.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.516 | TFLOPs: 42.22 | +[default7]: iteration 744/ 6200 | consumed samples: 761856 | consumed tokens: 1560281088 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.075448E+00 | loss scale: 1024.0 | grad norm: 6.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.451 | TFLOPs: 42.20 | +[default7]: iteration 745/ 6200 | consumed samples: 762880 | consumed tokens: 1562378240 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.067776E+00 | loss scale: 1024.0 | grad norm: 6.962 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.297 | TFLOPs: 42.15 | +[default7]: iteration 746/ 6200 | consumed samples: 763904 | consumed tokens: 1564475392 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.052769E+00 | loss scale: 1024.0 | grad norm: 6.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.003 | TFLOPs: 42.06 | +[default7]: iteration 747/ 6200 | consumed samples: 764928 | consumed tokens: 1566572544 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.051650E+00 | loss scale: 1024.0 | grad norm: 8.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.636 | TFLOPs: 42.26 | +[default7]: iteration 748/ 6200 | consumed samples: 765952 | consumed tokens: 1568669696 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.063760E+00 | loss scale: 1024.0 | grad norm: 5.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.962 | TFLOPs: 42.05 | +[default7]: iteration 749/ 6200 | consumed samples: 766976 | consumed tokens: 1570766848 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.033733E+00 | loss scale: 1024.0 | grad norm: 6.878 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.318 | TFLOPs: 42.16 | +[default7]: iteration 750/ 6200 | consumed samples: 768000 | consumed tokens: 1572864000 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.052578E+00 | loss scale: 1024.0 | grad norm: 6.060 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.411 | TFLOPs: 41.88 | +[default7]:---------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 750 | lm loss value: 3.403007E+00 | lm loss PPL: 3.005433E+01 | +[default7]:---------------------------------------------------------------------------------------------------------- +[default7]:---------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 750 | lm loss value: 1.962431E+00 | lm loss PPL: 7.116604E+00 | +[default7]:---------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 750 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 14:19:54,157] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step750 is begin to save! +[default0]:[2022-10-06 14:19:54,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,529] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,668] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,868] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,946] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:54,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:54,974] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:55,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:55,001] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:55,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:55,028] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:55,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:55,054] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:55,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:55,081] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:55,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:55,108] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:55,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:55,136] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_24-model_00-model_states.pt... +[default3]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 14:19:55,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:55,163] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:55,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:55,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:55,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:55,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 14:19:55,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 14:19:55,219] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/mp_rank_00_model_states.pt +[default0]:[2022-10-06 14:19:55,219] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 14:19:55,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 14:19:55,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 14:19:55,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 14:19:55,442] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 14:19:55,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 14:19:55,420] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 14:19:55,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 14:19:55,467] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 14:19:55,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 14:19:55,440] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 14:19:55,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 14:19:55,517] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 14:19:55,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 14:19:55,515] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 14:19:55,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 14:19:55,554] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 14:19:55,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 14:19:55,534] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 14:19:55,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 14:19:55,537] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 14:19:55,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 14:19:55,518] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 14:19:55,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 14:19:55,546] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 14:19:55,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 14:19:55,535] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 14:19:55,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 14:19:55,503] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 14:19:55,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 14:19:55,532] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 14:19:55,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 14:19:55,552] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 14:19:55,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 14:19:55,545] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 14:19:55,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 14:19:55,535] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 14:19:55,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 14:19:55,538] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 14:19:55,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 14:19:55,547] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 14:19:55,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 14:19:55,561] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 14:19:55,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 14:19:55,547] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 14:19:55,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 14:19:55,589] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 14:19:55,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 14:19:55,557] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default3]:[2022-10-06 14:19:55,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 14:19:55,561] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default4]:[2022-10-06 14:19:55,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 14:19:55,561] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default6]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default5]:[2022-10-06 14:19:55,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 14:19:55,575] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default0]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default0]: successfully saved checkpoint at iteration 750 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default6]:[2022-10-06 14:19:55,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 14:19:55,615] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default0]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default1]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default5]:[2022-10-06 14:19:55,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 14:19:55,624] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default7]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default7]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 14:19:55,639] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default7]:time (ms) | save-checkpoint: 1483.57 +[default1]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default5]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default0]:[2022-10-06 14:19:55,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 14:19:55,599] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default2]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default4]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default5]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default6]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default2]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default1]:[2022-10-06 14:19:55,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 14:19:55,597] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default4]:[2022-10-06 14:19:55,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 14:19:55,628] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step750/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default7]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default6]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default3]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default2]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default2]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default3]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default4]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default7]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default0]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default1]:[2022-10-06 14:19:55,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step750 is ready now! +[default7]: iteration 751/ 6200 | consumed samples: 769024 | consumed tokens: 1574961152 | elapsed time per iteration (s): 53.79 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.058366E+00 | loss scale: 1024.0 | grad norm: 5.951 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.036 | TFLOPs: 5.80 | +[default7]: iteration 752/ 6200 | consumed samples: 770048 | consumed tokens: 1577058304 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.047139E+00 | loss scale: 1024.0 | grad norm: 5.933 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.987 | TFLOPs: 42.06 | +[default7]: iteration 753/ 6200 | consumed samples: 771072 | consumed tokens: 1579155456 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.084960E+00 | loss scale: 1024.0 | grad norm: 7.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.076 | TFLOPs: 42.09 | +[default7]: iteration 754/ 6200 | consumed samples: 772096 | consumed tokens: 1581252608 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.048742E+00 | loss scale: 1024.0 | grad norm: 7.863 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.879 | TFLOPs: 42.03 | +[default7]: iteration 755/ 6200 | consumed samples: 773120 | consumed tokens: 1583349760 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.049424E+00 | loss scale: 1024.0 | grad norm: 6.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.860 | TFLOPs: 42.02 | +[default7]: iteration 756/ 6200 | consumed samples: 774144 | consumed tokens: 1585446912 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.049191E+00 | loss scale: 1024.0 | grad norm: 7.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.008 | TFLOPs: 42.07 | +[default7]: iteration 757/ 6200 | consumed samples: 775168 | consumed tokens: 1587544064 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.019501E+00 | loss scale: 1024.0 | grad norm: 7.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.870 | TFLOPs: 42.02 | +[default7]: iteration 758/ 6200 | consumed samples: 776192 | consumed tokens: 1589641216 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.059605E+00 | loss scale: 1024.0 | grad norm: 6.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.991 | TFLOPs: 42.06 | +[default7]: iteration 759/ 6200 | consumed samples: 777216 | consumed tokens: 1591738368 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.024982E+00 | loss scale: 1024.0 | grad norm: 6.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.202 | TFLOPs: 42.13 | +[default7]: iteration 760/ 6200 | consumed samples: 778240 | consumed tokens: 1593835520 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.037291E+00 | loss scale: 1024.0 | grad norm: 10.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.079 | TFLOPs: 42.09 | +[default7]: iteration 761/ 6200 | consumed samples: 779264 | consumed tokens: 1595932672 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.056539E+00 | loss scale: 1024.0 | grad norm: 6.834 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.068 | TFLOPs: 42.08 | +[default7]: iteration 762/ 6200 | consumed samples: 780288 | consumed tokens: 1598029824 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.077612E+00 | loss scale: 1024.0 | grad norm: 10.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.087 | TFLOPs: 42.09 | +[default7]: iteration 763/ 6200 | consumed samples: 781312 | consumed tokens: 1600126976 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.049597E+00 | loss scale: 1024.0 | grad norm: 7.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.553 | TFLOPs: 42.23 | +[default7]: iteration 764/ 6200 | consumed samples: 782336 | consumed tokens: 1602224128 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.066947E+00 | loss scale: 1024.0 | grad norm: 6.718 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.166 | TFLOPs: 42.11 | +[default7]: iteration 765/ 6200 | consumed samples: 783360 | consumed tokens: 1604321280 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.069366E+00 | loss scale: 1024.0 | grad norm: 7.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.574 | TFLOPs: 41.93 | +[default7]: iteration 766/ 6200 | consumed samples: 784384 | consumed tokens: 1606418432 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.051847E+00 | loss scale: 1024.0 | grad norm: 5.925 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.879 | TFLOPs: 42.03 | +[default7]: iteration 767/ 6200 | consumed samples: 785408 | consumed tokens: 1608515584 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.051230E+00 | loss scale: 1024.0 | grad norm: 5.607 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.909 | TFLOPs: 42.04 | +[default7]: iteration 768/ 6200 | consumed samples: 786432 | consumed tokens: 1610612736 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.062181E+00 | loss scale: 1024.0 | grad norm: 7.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.549 | TFLOPs: 41.93 | +[default7]: iteration 769/ 6200 | consumed samples: 787456 | consumed tokens: 1612709888 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.042650E+00 | loss scale: 1024.0 | grad norm: 7.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.579 | TFLOPs: 41.94 | +[default7]: iteration 770/ 6200 | consumed samples: 788480 | consumed tokens: 1614807040 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.013668E+00 | loss scale: 1024.0 | grad norm: 6.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.982 | TFLOPs: 42.06 | +[default7]: iteration 771/ 6200 | consumed samples: 789504 | consumed tokens: 1616904192 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.059178E+00 | loss scale: 1024.0 | grad norm: 7.637 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.298 | TFLOPs: 41.85 | +[default7]: iteration 772/ 6200 | consumed samples: 790528 | consumed tokens: 1619001344 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.069991E+00 | loss scale: 1024.0 | grad norm: 8.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.707 | TFLOPs: 41.97 | +[default7]: iteration 773/ 6200 | consumed samples: 791552 | consumed tokens: 1621098496 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.052222E+00 | loss scale: 1024.0 | grad norm: 6.715 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.571 | TFLOPs: 41.93 | +[default7]: iteration 774/ 6200 | consumed samples: 792576 | consumed tokens: 1623195648 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.037794E+00 | loss scale: 1024.0 | grad norm: 6.923 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.148 | TFLOPs: 42.11 | +[default7]: iteration 775/ 6200 | consumed samples: 793600 | consumed tokens: 1625292800 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.065675E+00 | loss scale: 1024.0 | grad norm: 7.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.277 | TFLOPs: 42.15 | +[default7]: iteration 776/ 6200 | consumed samples: 794624 | consumed tokens: 1627389952 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.050198E+00 | loss scale: 1024.0 | grad norm: 5.930 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.128 | TFLOPs: 42.10 | +[default7]: iteration 777/ 6200 | consumed samples: 795648 | consumed tokens: 1629487104 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.053440E+00 | loss scale: 1024.0 | grad norm: 7.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.182 | TFLOPs: 42.12 | +[default7]: iteration 778/ 6200 | consumed samples: 796672 | consumed tokens: 1631584256 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.037247E+00 | loss scale: 1024.0 | grad norm: 6.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.130 | TFLOPs: 42.10 | +[default7]: iteration 779/ 6200 | consumed samples: 797696 | consumed tokens: 1633681408 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.026917E+00 | loss scale: 1024.0 | grad norm: 7.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.946 | TFLOPs: 42.05 | +[default7]: iteration 780/ 6200 | consumed samples: 798720 | consumed tokens: 1635778560 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.042857E+00 | loss scale: 1024.0 | grad norm: 9.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.213 | TFLOPs: 42.13 | +[default7]: iteration 781/ 6200 | consumed samples: 799744 | consumed tokens: 1637875712 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.061282E+00 | loss scale: 1024.0 | grad norm: 5.829 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.307 | TFLOPs: 42.16 | +[default7]: iteration 782/ 6200 | consumed samples: 800768 | consumed tokens: 1639972864 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.038735E+00 | loss scale: 1024.0 | grad norm: 6.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.259 | TFLOPs: 42.14 | +[default7]: iteration 783/ 6200 | consumed samples: 801792 | consumed tokens: 1642070016 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.059796E+00 | loss scale: 1024.0 | grad norm: 8.016 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.338 | TFLOPs: 42.17 | +[default7]: iteration 784/ 6200 | consumed samples: 802816 | consumed tokens: 1644167168 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.085276E+00 | loss scale: 1024.0 | grad norm: 7.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.528 | TFLOPs: 42.22 | +[default7]: iteration 785/ 6200 | consumed samples: 803840 | consumed tokens: 1646264320 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.063754E+00 | loss scale: 1024.0 | grad norm: 7.043 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.906 | TFLOPs: 42.03 | +[default7]: iteration 786/ 6200 | consumed samples: 804864 | consumed tokens: 1648361472 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.061047E+00 | loss scale: 1024.0 | grad norm: 7.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.903 | TFLOPs: 42.03 | +[default7]: iteration 787/ 6200 | consumed samples: 805888 | consumed tokens: 1650458624 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.091562E+00 | loss scale: 1024.0 | grad norm: 6.004 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.255 | TFLOPs: 42.14 | +[default7]: iteration 788/ 6200 | consumed samples: 806912 | consumed tokens: 1652555776 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.050306E+00 | loss scale: 1024.0 | grad norm: 6.770 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.141 | TFLOPs: 42.11 | +[default7]: iteration 789/ 6200 | consumed samples: 807936 | consumed tokens: 1654652928 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.013119E+00 | loss scale: 1024.0 | grad norm: 6.801 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.249 | TFLOPs: 42.14 | +[default7]: iteration 790/ 6200 | consumed samples: 808960 | consumed tokens: 1656750080 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.063072E+00 | loss scale: 1024.0 | grad norm: 5.891 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.388 | TFLOPs: 42.18 | +[default7]: iteration 791/ 6200 | consumed samples: 809984 | consumed tokens: 1658847232 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.060043E+00 | loss scale: 1024.0 | grad norm: 8.044 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.388 | TFLOPs: 42.18 | +[default7]: iteration 792/ 6200 | consumed samples: 811008 | consumed tokens: 1660944384 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.060771E+00 | loss scale: 1024.0 | grad norm: 8.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.610 | TFLOPs: 42.25 | +[default7]: iteration 793/ 6200 | consumed samples: 812032 | consumed tokens: 1663041536 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.069264E+00 | loss scale: 1024.0 | grad norm: 6.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.042 | TFLOPs: 42.08 | +[default7]: iteration 794/ 6200 | consumed samples: 813056 | consumed tokens: 1665138688 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.044250E+00 | loss scale: 1024.0 | grad norm: 5.976 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.856 | TFLOPs: 42.02 | +[default7]: iteration 795/ 6200 | consumed samples: 814080 | consumed tokens: 1667235840 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.046430E+00 | loss scale: 1024.0 | grad norm: 9.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.488 | TFLOPs: 41.91 | +[default7]: iteration 796/ 6200 | consumed samples: 815104 | consumed tokens: 1669332992 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.047172E+00 | loss scale: 1024.0 | grad norm: 6.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.629 | TFLOPs: 41.95 | +[default7]: iteration 797/ 6200 | consumed samples: 816128 | consumed tokens: 1671430144 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.060445E+00 | loss scale: 1024.0 | grad norm: 6.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.346 | TFLOPs: 42.17 | +[default7]: iteration 798/ 6200 | consumed samples: 817152 | consumed tokens: 1673527296 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.016848E+00 | loss scale: 1024.0 | grad norm: 6.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.678 | TFLOPs: 42.27 | +[default7]: iteration 799/ 6200 | consumed samples: 818176 | consumed tokens: 1675624448 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.021233E+00 | loss scale: 1024.0 | grad norm: 7.870 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.179 | TFLOPs: 42.12 | +[default7]: iteration 800/ 6200 | consumed samples: 819200 | consumed tokens: 1677721600 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.048793E+00 | loss scale: 1024.0 | grad norm: 6.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.537 | TFLOPs: 42.23 | +[default7]: iteration 801/ 6200 | consumed samples: 820224 | consumed tokens: 1679818752 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.032413E+00 | loss scale: 1024.0 | grad norm: 6.717 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.035 | TFLOPs: 42.07 | +[default7]: iteration 802/ 6200 | consumed samples: 821248 | consumed tokens: 1681915904 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.038586E+00 | loss scale: 1024.0 | grad norm: 5.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.935 | TFLOPs: 42.04 | +[default7]: iteration 803/ 6200 | consumed samples: 822272 | consumed tokens: 1684013056 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.054937E+00 | loss scale: 1024.0 | grad norm: 6.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.349 | TFLOPs: 42.17 | +[default7]: iteration 804/ 6200 | consumed samples: 823296 | consumed tokens: 1686110208 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.075244E+00 | loss scale: 1024.0 | grad norm: 8.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.804 | TFLOPs: 42.00 | +[default7]: iteration 805/ 6200 | consumed samples: 824320 | consumed tokens: 1688207360 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.048973E+00 | loss scale: 1024.0 | grad norm: 6.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.962 | TFLOPs: 42.05 | +[default7]: iteration 806/ 6200 | consumed samples: 825344 | consumed tokens: 1690304512 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.025335E+00 | loss scale: 1024.0 | grad norm: 6.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.443 | TFLOPs: 42.20 | +[default7]: iteration 807/ 6200 | consumed samples: 826368 | consumed tokens: 1692401664 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.048753E+00 | loss scale: 1024.0 | grad norm: 7.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.834 | TFLOPs: 42.01 | +[default7]: iteration 808/ 6200 | consumed samples: 827392 | consumed tokens: 1694498816 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.033502E+00 | loss scale: 1024.0 | grad norm: 5.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.269 | TFLOPs: 42.15 | +[default7]: iteration 809/ 6200 | consumed samples: 828416 | consumed tokens: 1696595968 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.038966E+00 | loss scale: 1024.0 | grad norm: 6.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.447 | TFLOPs: 41.89 | +[default7]: iteration 810/ 6200 | consumed samples: 829440 | consumed tokens: 1698693120 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.040459E+00 | loss scale: 1024.0 | grad norm: 5.801 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.820 | TFLOPs: 42.01 | +[default7]: iteration 811/ 6200 | consumed samples: 830464 | consumed tokens: 1700790272 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.055404E+00 | loss scale: 1024.0 | grad norm: 5.634 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.740 | TFLOPs: 42.29 | +[default7]: iteration 812/ 6200 | consumed samples: 831488 | consumed tokens: 1702887424 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.028927E+00 | loss scale: 1024.0 | grad norm: 5.641 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.457 | TFLOPs: 42.20 | +[default7]: iteration 813/ 6200 | consumed samples: 832512 | consumed tokens: 1704984576 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.074240E+00 | loss scale: 1024.0 | grad norm: 5.686 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.936 | TFLOPs: 42.04 | +[default7]: iteration 814/ 6200 | consumed samples: 833536 | consumed tokens: 1707081728 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.034485E+00 | loss scale: 1024.0 | grad norm: 7.852 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.141 | TFLOPs: 42.11 | +[default7]: iteration 815/ 6200 | consumed samples: 834560 | consumed tokens: 1709178880 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.012164E+00 | loss scale: 1024.0 | grad norm: 6.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.136 | TFLOPs: 41.80 | +[default7]: iteration 816/ 6200 | consumed samples: 835584 | consumed tokens: 1711276032 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.037641E+00 | loss scale: 1024.0 | grad norm: 7.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.768 | TFLOPs: 41.99 | +[default7]: iteration 817/ 6200 | consumed samples: 836608 | consumed tokens: 1713373184 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.059701E+00 | loss scale: 1024.0 | grad norm: 7.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.172 | TFLOPs: 41.81 | +[default7]: iteration 818/ 6200 | consumed samples: 837632 | consumed tokens: 1715470336 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.056734E+00 | loss scale: 1024.0 | grad norm: 6.692 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.665 | TFLOPs: 41.96 | +[default7]: iteration 819/ 6200 | consumed samples: 838656 | consumed tokens: 1717567488 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.036662E+00 | loss scale: 1024.0 | grad norm: 5.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.629 | TFLOPs: 41.95 | +[default7]: iteration 820/ 6200 | consumed samples: 839680 | consumed tokens: 1719664640 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.058619E+00 | loss scale: 1024.0 | grad norm: 5.605 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.935 | TFLOPs: 42.04 | +[default7]: iteration 821/ 6200 | consumed samples: 840704 | consumed tokens: 1721761792 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.025011E+00 | loss scale: 1024.0 | grad norm: 5.721 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.304 | TFLOPs: 42.16 | +[default7]: iteration 822/ 6200 | consumed samples: 841728 | consumed tokens: 1723858944 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.047239E+00 | loss scale: 1024.0 | grad norm: 5.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.965 | TFLOPs: 42.05 | +[default7]: iteration 823/ 6200 | consumed samples: 842752 | consumed tokens: 1725956096 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.053573E+00 | loss scale: 1024.0 | grad norm: 5.856 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.867 | TFLOPs: 42.02 | +[default7]: iteration 824/ 6200 | consumed samples: 843776 | consumed tokens: 1728053248 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.046160E+00 | loss scale: 1024.0 | grad norm: 6.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.087 | TFLOPs: 42.09 | +[default7]: iteration 825/ 6200 | consumed samples: 844800 | consumed tokens: 1730150400 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.045958E+00 | loss scale: 1024.0 | grad norm: 6.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.783 | TFLOPs: 42.00 | +[default7]: iteration 826/ 6200 | consumed samples: 845824 | consumed tokens: 1732247552 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.037469E+00 | loss scale: 1024.0 | grad norm: 7.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.268 | TFLOPs: 41.84 | +[default7]: iteration 827/ 6200 | consumed samples: 846848 | consumed tokens: 1734344704 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.025526E+00 | loss scale: 1024.0 | grad norm: 7.006 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.003 | TFLOPs: 41.76 | +[default7]: iteration 828/ 6200 | consumed samples: 847872 | consumed tokens: 1736441856 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.044968E+00 | loss scale: 1024.0 | grad norm: 6.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.070 | TFLOPs: 41.78 | +[default7]: iteration 829/ 6200 | consumed samples: 848896 | consumed tokens: 1738539008 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.061302E+00 | loss scale: 1024.0 | grad norm: 5.944 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.813 | TFLOPs: 42.01 | +[default7]: iteration 830/ 6200 | consumed samples: 849920 | consumed tokens: 1740636160 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.056875E+00 | loss scale: 1024.0 | grad norm: 8.614 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.527 | TFLOPs: 42.22 | +[default7]: iteration 831/ 6200 | consumed samples: 850944 | consumed tokens: 1742733312 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.042027E+00 | loss scale: 1024.0 | grad norm: 5.854 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.290 | TFLOPs: 41.85 | +[default7]: iteration 832/ 6200 | consumed samples: 851968 | consumed tokens: 1744830464 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.057919E+00 | loss scale: 1024.0 | grad norm: 7.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.755 | TFLOPs: 41.68 | +[default7]: iteration 833/ 6200 | consumed samples: 852992 | consumed tokens: 1746927616 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.035837E+00 | loss scale: 1024.0 | grad norm: 6.743 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.909 | TFLOPs: 41.73 | +[default7]: iteration 834/ 6200 | consumed samples: 854016 | consumed tokens: 1749024768 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.032338E+00 | loss scale: 1024.0 | grad norm: 5.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.252 | TFLOPs: 41.84 | +[default7]: iteration 835/ 6200 | consumed samples: 855040 | consumed tokens: 1751121920 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.050802E+00 | loss scale: 1024.0 | grad norm: 5.703 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.929 | TFLOPs: 41.74 | +[default7]: iteration 836/ 6200 | consumed samples: 856064 | consumed tokens: 1753219072 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.034328E+00 | loss scale: 1024.0 | grad norm: 5.832 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.242 | TFLOPs: 41.83 | +[default7]: iteration 837/ 6200 | consumed samples: 857088 | consumed tokens: 1755316224 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.032706E+00 | loss scale: 1024.0 | grad norm: 6.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.536 | TFLOPs: 41.92 | +[default7]: iteration 838/ 6200 | consumed samples: 858112 | consumed tokens: 1757413376 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.052304E+00 | loss scale: 1024.0 | grad norm: 5.801 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.060 | TFLOPs: 41.78 | +[default7]: iteration 839/ 6200 | consumed samples: 859136 | consumed tokens: 1759510528 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.044542E+00 | loss scale: 1024.0 | grad norm: 6.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.287 | TFLOPs: 42.15 | +[default7]: iteration 840/ 6200 | consumed samples: 860160 | consumed tokens: 1761607680 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.041959E+00 | loss scale: 1024.0 | grad norm: 5.894 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.922 | TFLOPs: 42.04 | +[default7]: iteration 841/ 6200 | consumed samples: 861184 | consumed tokens: 1763704832 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.031493E+00 | loss scale: 1024.0 | grad norm: 6.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.285 | TFLOPs: 41.85 | +[default7]: iteration 842/ 6200 | consumed samples: 862208 | consumed tokens: 1765801984 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.048186E+00 | loss scale: 1024.0 | grad norm: 6.762 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.382 | TFLOPs: 41.88 | +[default7]: iteration 843/ 6200 | consumed samples: 863232 | consumed tokens: 1767899136 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.039033E+00 | loss scale: 1024.0 | grad norm: 5.815 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.361 | TFLOPs: 41.87 | +[default7]: iteration 844/ 6200 | consumed samples: 864256 | consumed tokens: 1769996288 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.013198E+00 | loss scale: 1024.0 | grad norm: 7.992 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.941 | TFLOPs: 42.05 | +[default7]: iteration 845/ 6200 | consumed samples: 865280 | consumed tokens: 1772093440 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.030252E+00 | loss scale: 1024.0 | grad norm: 10.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.378 | TFLOPs: 41.87 | +[default7]: iteration 846/ 6200 | consumed samples: 866304 | consumed tokens: 1774190592 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.028407E+00 | loss scale: 1024.0 | grad norm: 5.651 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.291 | TFLOPs: 41.85 | +[default7]: iteration 847/ 6200 | consumed samples: 867328 | consumed tokens: 1776287744 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.060054E+00 | loss scale: 1024.0 | grad norm: 6.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.774 | TFLOPs: 41.99 | +[default7]: iteration 848/ 6200 | consumed samples: 868352 | consumed tokens: 1778384896 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.040329E+00 | loss scale: 1024.0 | grad norm: 6.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.162 | TFLOPs: 41.81 | +[default7]: iteration 849/ 6200 | consumed samples: 869376 | consumed tokens: 1780482048 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.013659E+00 | loss scale: 1024.0 | grad norm: 5.757 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.518 | TFLOPs: 41.92 | +[default7]: iteration 850/ 6200 | consumed samples: 870400 | consumed tokens: 1782579200 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.038299E+00 | loss scale: 1024.0 | grad norm: 6.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.257 | TFLOPs: 42.14 | +[default7]: iteration 851/ 6200 | consumed samples: 871424 | consumed tokens: 1784676352 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.031329E+00 | loss scale: 1024.0 | grad norm: 7.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.088 | TFLOPs: 42.09 | +[default7]: iteration 852/ 6200 | consumed samples: 872448 | consumed tokens: 1786773504 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.020488E+00 | loss scale: 1024.0 | grad norm: 6.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.083 | TFLOPs: 42.09 | +[default7]: iteration 853/ 6200 | consumed samples: 873472 | consumed tokens: 1788870656 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.030377E+00 | loss scale: 1024.0 | grad norm: 6.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.418 | TFLOPs: 41.89 | +[default7]: iteration 854/ 6200 | consumed samples: 874496 | consumed tokens: 1790967808 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.008564E+00 | loss scale: 1024.0 | grad norm: 7.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.192 | TFLOPs: 41.82 | +[default7]: iteration 855/ 6200 | consumed samples: 875520 | consumed tokens: 1793064960 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.055538E+00 | loss scale: 1024.0 | grad norm: 7.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.285 | TFLOPs: 42.15 | +[default7]: iteration 856/ 6200 | consumed samples: 876544 | consumed tokens: 1795162112 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.042592E+00 | loss scale: 1024.0 | grad norm: 7.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.026 | TFLOPs: 42.07 | +[default7]: iteration 857/ 6200 | consumed samples: 877568 | consumed tokens: 1797259264 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.050631E+00 | loss scale: 1024.0 | grad norm: 5.949 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.027 | TFLOPs: 42.07 | +[default7]: iteration 858/ 6200 | consumed samples: 878592 | consumed tokens: 1799356416 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.036022E+00 | loss scale: 1024.0 | grad norm: 6.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.769 | TFLOPs: 41.99 | +[default7]: iteration 859/ 6200 | consumed samples: 879616 | consumed tokens: 1801453568 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.035196E+00 | loss scale: 1024.0 | grad norm: 6.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.838 | TFLOPs: 42.01 | +[default7]: iteration 860/ 6200 | consumed samples: 880640 | consumed tokens: 1803550720 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.022081E+00 | loss scale: 1024.0 | grad norm: 5.791 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.495 | TFLOPs: 42.21 | +[default7]: iteration 861/ 6200 | consumed samples: 881664 | consumed tokens: 1805647872 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.014489E+00 | loss scale: 1024.0 | grad norm: 6.905 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.097 | TFLOPs: 42.09 | +[default7]: iteration 862/ 6200 | consumed samples: 882688 | consumed tokens: 1807745024 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.013909E+00 | loss scale: 1024.0 | grad norm: 6.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.339 | TFLOPs: 42.17 | +[default7]: iteration 863/ 6200 | consumed samples: 883712 | consumed tokens: 1809842176 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.046166E+00 | loss scale: 1024.0 | grad norm: 6.905 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.893 | TFLOPs: 42.03 | +[default7]: iteration 864/ 6200 | consumed samples: 884736 | consumed tokens: 1811939328 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.067894E+00 | loss scale: 1024.0 | grad norm: 6.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.950 | TFLOPs: 42.05 | +[default7]: iteration 865/ 6200 | consumed samples: 885760 | consumed tokens: 1814036480 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.007130E+00 | loss scale: 1024.0 | grad norm: 5.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.962 | TFLOPs: 41.75 | +[default7]: iteration 866/ 6200 | consumed samples: 886784 | consumed tokens: 1816133632 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.041279E+00 | loss scale: 1024.0 | grad norm: 5.985 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.680 | TFLOPs: 41.97 | +[default7]: iteration 867/ 6200 | consumed samples: 887808 | consumed tokens: 1818230784 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.028520E+00 | loss scale: 1024.0 | grad norm: 6.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.009 | TFLOPs: 41.76 | +[default7]: iteration 868/ 6200 | consumed samples: 888832 | consumed tokens: 1820327936 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.024845E+00 | loss scale: 1024.0 | grad norm: 6.653 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.736 | TFLOPs: 41.68 | +[default7]: iteration 869/ 6200 | consumed samples: 889856 | consumed tokens: 1822425088 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.035555E+00 | loss scale: 1024.0 | grad norm: 5.944 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.381 | TFLOPs: 41.87 | +[default7]: iteration 870/ 6200 | consumed samples: 890880 | consumed tokens: 1824522240 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.021579E+00 | loss scale: 1024.0 | grad norm: 5.923 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.944 | TFLOPs: 42.05 | +[default7]: iteration 871/ 6200 | consumed samples: 891904 | consumed tokens: 1826619392 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.004729E+00 | loss scale: 1024.0 | grad norm: 6.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.458 | TFLOPs: 42.20 | +[default7]: iteration 872/ 6200 | consumed samples: 892928 | consumed tokens: 1828716544 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.039126E+00 | loss scale: 1024.0 | grad norm: 6.025 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.516 | TFLOPs: 41.92 | +[default7]: iteration 873/ 6200 | consumed samples: 893952 | consumed tokens: 1830813696 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.000637E+00 | loss scale: 1024.0 | grad norm: 6.047 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.113 | TFLOPs: 42.10 | +[default7]: iteration 874/ 6200 | consumed samples: 894976 | consumed tokens: 1832910848 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.029014E+00 | loss scale: 1024.0 | grad norm: 5.772 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.410 | TFLOPs: 41.88 | +[default7]: iteration 875/ 6200 | consumed samples: 896000 | consumed tokens: 1835008000 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.031846E+00 | loss scale: 1024.0 | grad norm: 6.977 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.937 | TFLOPs: 42.04 | +[default7]:---------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 875 | lm loss value: 3.424131E+00 | lm loss PPL: 3.069597E+01 | +[default7]:---------------------------------------------------------------------------------------------------------- +[default7]:---------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 875 | lm loss value: 1.944732E+00 | lm loss PPL: 6.991758E+00 | +[default7]:---------------------------------------------------------------------------------------------- +[default7]: iteration 876/ 6200 | consumed samples: 897024 | consumed tokens: 1837105152 | elapsed time per iteration (s): 51.81 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.020419E+00 | loss scale: 1024.0 | grad norm: 8.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.764 | TFLOPs: 6.02 | +[default7]: iteration 877/ 6200 | consumed samples: 898048 | consumed tokens: 1839202304 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.022913E+00 | loss scale: 1024.0 | grad norm: 5.771 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.617 | TFLOPs: 41.95 | +[default7]: iteration 878/ 6200 | consumed samples: 899072 | consumed tokens: 1841299456 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.038641E+00 | loss scale: 1024.0 | grad norm: 6.583 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.330 | TFLOPs: 42.16 | +[default7]: iteration 879/ 6200 | consumed samples: 900096 | consumed tokens: 1843396608 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.051353E+00 | loss scale: 1024.0 | grad norm: 7.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.231 | TFLOPs: 42.13 | +[default7]: iteration 880/ 6200 | consumed samples: 901120 | consumed tokens: 1845493760 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.046439E+00 | loss scale: 1024.0 | grad norm: 6.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.256 | TFLOPs: 41.84 | +[default7]: iteration 881/ 6200 | consumed samples: 902144 | consumed tokens: 1847590912 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.023401E+00 | loss scale: 1024.0 | grad norm: 6.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.879 | TFLOPs: 42.03 | +[default7]: iteration 882/ 6200 | consumed samples: 903168 | consumed tokens: 1849688064 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.045967E+00 | loss scale: 1024.0 | grad norm: 6.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.977 | TFLOPs: 42.06 | +[default7]: iteration 883/ 6200 | consumed samples: 904192 | consumed tokens: 1851785216 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.036798E+00 | loss scale: 1024.0 | grad norm: 5.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.330 | TFLOPs: 41.86 | +[default7]: iteration 884/ 6200 | consumed samples: 905216 | consumed tokens: 1853882368 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.008371E+00 | loss scale: 1024.0 | grad norm: 6.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.475 | TFLOPs: 41.90 | +[default7]: iteration 885/ 6200 | consumed samples: 906240 | consumed tokens: 1855979520 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.050002E+00 | loss scale: 1024.0 | grad norm: 7.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.236 | TFLOPs: 42.14 | +[default7]: iteration 886/ 6200 | consumed samples: 907264 | consumed tokens: 1858076672 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.044703E+00 | loss scale: 1024.0 | grad norm: 5.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.364 | TFLOPs: 42.17 | +[default7]: iteration 887/ 6200 | consumed samples: 908288 | consumed tokens: 1860173824 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.058125E+00 | loss scale: 1024.0 | grad norm: 8.069 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.215 | TFLOPs: 42.13 | +[default7]: iteration 888/ 6200 | consumed samples: 909312 | consumed tokens: 1862270976 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.040206E+00 | loss scale: 1024.0 | grad norm: 6.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.156 | TFLOPs: 42.11 | +[default7]: iteration 889/ 6200 | consumed samples: 910336 | consumed tokens: 1864368128 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.021282E+00 | loss scale: 1024.0 | grad norm: 5.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.161 | TFLOPs: 42.11 | +[default7]: iteration 890/ 6200 | consumed samples: 911360 | consumed tokens: 1866465280 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.033593E+00 | loss scale: 1024.0 | grad norm: 6.741 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.217 | TFLOPs: 42.13 | +[default7]: iteration 891/ 6200 | consumed samples: 912384 | consumed tokens: 1868562432 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.044933E+00 | loss scale: 1024.0 | grad norm: 5.932 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.934 | TFLOPs: 42.04 | +[default7]: iteration 892/ 6200 | consumed samples: 913408 | consumed tokens: 1870659584 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.020993E+00 | loss scale: 1024.0 | grad norm: 6.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.901 | TFLOPs: 41.73 | +[default7]: iteration 893/ 6200 | consumed samples: 914432 | consumed tokens: 1872756736 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.026403E+00 | loss scale: 1024.0 | grad norm: 6.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.345 | TFLOPs: 41.86 | +[default7]: iteration 894/ 6200 | consumed samples: 915456 | consumed tokens: 1874853888 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.055792E+00 | loss scale: 1024.0 | grad norm: 5.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.595 | TFLOPs: 42.24 | +[default7]: iteration 895/ 6200 | consumed samples: 916480 | consumed tokens: 1876951040 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.044415E+00 | loss scale: 1024.0 | grad norm: 6.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.296 | TFLOPs: 42.15 | +[default7]: iteration 896/ 6200 | consumed samples: 917504 | consumed tokens: 1879048192 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.038395E+00 | loss scale: 1024.0 | grad norm: 6.613 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.287 | TFLOPs: 42.15 | +[default7]: iteration 897/ 6200 | consumed samples: 918528 | consumed tokens: 1881145344 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.018606E+00 | loss scale: 1024.0 | grad norm: 7.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.014 | TFLOPs: 42.07 | +[default7]: iteration 898/ 6200 | consumed samples: 919552 | consumed tokens: 1883242496 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.028332E+00 | loss scale: 1024.0 | grad norm: 6.903 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.433 | TFLOPs: 42.20 | +[default7]: iteration 899/ 6200 | consumed samples: 920576 | consumed tokens: 1885339648 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.022314E+00 | loss scale: 1024.0 | grad norm: 5.826 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.140 | TFLOPs: 42.11 | +[default7]: iteration 900/ 6200 | consumed samples: 921600 | consumed tokens: 1887436800 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.038650E+00 | loss scale: 1024.0 | grad norm: 6.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.223 | TFLOPs: 42.13 | +[default7]: iteration 901/ 6200 | consumed samples: 922624 | consumed tokens: 1889533952 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.990420E+00 | loss scale: 1024.0 | grad norm: 7.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.605 | TFLOPs: 42.25 | +[default7]: iteration 902/ 6200 | consumed samples: 923648 | consumed tokens: 1891631104 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.009160E+00 | loss scale: 1024.0 | grad norm: 5.636 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.276 | TFLOPs: 42.15 | +[default7]: iteration 903/ 6200 | consumed samples: 924672 | consumed tokens: 1893728256 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.032473E+00 | loss scale: 1024.0 | grad norm: 6.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.287 | TFLOPs: 42.15 | +[default7]: iteration 904/ 6200 | consumed samples: 925696 | consumed tokens: 1895825408 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.026912E+00 | loss scale: 1024.0 | grad norm: 7.030 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.040 | TFLOPs: 42.08 | +[default7]: iteration 905/ 6200 | consumed samples: 926720 | consumed tokens: 1897922560 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.022072E+00 | loss scale: 1024.0 | grad norm: 5.850 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.928 | TFLOPs: 42.04 | +[default7]: iteration 906/ 6200 | consumed samples: 927744 | consumed tokens: 1900019712 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.018427E+00 | loss scale: 1024.0 | grad norm: 7.901 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.705 | TFLOPs: 41.97 | +[default7]: iteration 907/ 6200 | consumed samples: 928768 | consumed tokens: 1902116864 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.054940E+00 | loss scale: 1024.0 | grad norm: 8.906 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.588 | TFLOPs: 41.94 | +[default7]: iteration 908/ 6200 | consumed samples: 929792 | consumed tokens: 1904214016 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.002634E+00 | loss scale: 1024.0 | grad norm: 5.692 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.800 | TFLOPs: 42.00 | +[default7]: iteration 909/ 6200 | consumed samples: 930816 | consumed tokens: 1906311168 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.027771E+00 | loss scale: 1024.0 | grad norm: 7.619 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.217 | TFLOPs: 42.13 | +[default7]: iteration 910/ 6200 | consumed samples: 931840 | consumed tokens: 1908408320 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.033079E+00 | loss scale: 1024.0 | grad norm: 9.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.195 | TFLOPs: 42.12 | +[default7]: iteration 911/ 6200 | consumed samples: 932864 | consumed tokens: 1910505472 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.023040E+00 | loss scale: 1024.0 | grad norm: 6.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.968 | TFLOPs: 42.05 | +[default7]: iteration 912/ 6200 | consumed samples: 933888 | consumed tokens: 1912602624 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.035340E+00 | loss scale: 1024.0 | grad norm: 6.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.885 | TFLOPs: 42.03 | +[default7]: iteration 913/ 6200 | consumed samples: 934912 | consumed tokens: 1914699776 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.026740E+00 | loss scale: 1024.0 | grad norm: 9.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.035 | TFLOPs: 42.07 | +[default7]: iteration 914/ 6200 | consumed samples: 935936 | consumed tokens: 1916796928 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.999842E+00 | loss scale: 1024.0 | grad norm: 6.854 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.182 | TFLOPs: 42.12 | +[default7]: iteration 915/ 6200 | consumed samples: 936960 | consumed tokens: 1918894080 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.015755E+00 | loss scale: 1024.0 | grad norm: 7.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.053 | TFLOPs: 42.08 | +[default7]: iteration 916/ 6200 | consumed samples: 937984 | consumed tokens: 1920991232 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.022737E+00 | loss scale: 1024.0 | grad norm: 9.569 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.861 | TFLOPs: 42.02 | +[default7]: iteration 917/ 6200 | consumed samples: 939008 | consumed tokens: 1923088384 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.034194E+00 | loss scale: 1024.0 | grad norm: 7.636 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.312 | TFLOPs: 42.16 | +[default7]: iteration 918/ 6200 | consumed samples: 940032 | consumed tokens: 1925185536 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.014122E+00 | loss scale: 1024.0 | grad norm: 7.923 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.793 | TFLOPs: 42.00 | +[default7]: iteration 919/ 6200 | consumed samples: 941056 | consumed tokens: 1927282688 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.005148E+00 | loss scale: 1024.0 | grad norm: 7.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.389 | TFLOPs: 42.18 | +[default7]: iteration 920/ 6200 | consumed samples: 942080 | consumed tokens: 1929379840 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.025376E+00 | loss scale: 1024.0 | grad norm: 7.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.154 | TFLOPs: 42.11 | +[default7]: iteration 921/ 6200 | consumed samples: 943104 | consumed tokens: 1931476992 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.014624E+00 | loss scale: 1024.0 | grad norm: 6.698 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.454 | TFLOPs: 41.90 | +[default7]: iteration 922/ 6200 | consumed samples: 944128 | consumed tokens: 1933574144 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.005284E+00 | loss scale: 1024.0 | grad norm: 7.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.447 | TFLOPs: 41.89 | +[default7]: iteration 923/ 6200 | consumed samples: 945152 | consumed tokens: 1935671296 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.003960E+00 | loss scale: 1024.0 | grad norm: 6.663 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.712 | TFLOPs: 41.98 | +[default7]: iteration 924/ 6200 | consumed samples: 946176 | consumed tokens: 1937768448 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.013716E+00 | loss scale: 1024.0 | grad norm: 5.765 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.943 | TFLOPs: 42.05 | +[default7]: iteration 925/ 6200 | consumed samples: 947200 | consumed tokens: 1939865600 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.000377E+00 | loss scale: 1024.0 | grad norm: 6.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.204 | TFLOPs: 41.82 | +[default7]: iteration 926/ 6200 | consumed samples: 948224 | consumed tokens: 1941962752 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.027696E+00 | loss scale: 1024.0 | grad norm: 6.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.462 | TFLOPs: 41.90 | +[default7]: iteration 927/ 6200 | consumed samples: 949248 | consumed tokens: 1944059904 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.004807E+00 | loss scale: 1024.0 | grad norm: 5.822 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.104 | TFLOPs: 42.10 | +[default7]: iteration 928/ 6200 | consumed samples: 950272 | consumed tokens: 1946157056 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.012233E+00 | loss scale: 1024.0 | grad norm: 6.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.191 | TFLOPs: 42.12 | +[default7]: iteration 929/ 6200 | consumed samples: 951296 | consumed tokens: 1948254208 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.011177E+00 | loss scale: 1024.0 | grad norm: 5.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.176 | TFLOPs: 42.12 | +[default7]: iteration 930/ 6200 | consumed samples: 952320 | consumed tokens: 1950351360 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.016161E+00 | loss scale: 1024.0 | grad norm: 6.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.043 | TFLOPs: 42.08 | +[default7]: iteration 931/ 6200 | consumed samples: 953344 | consumed tokens: 1952448512 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.022784E+00 | loss scale: 1024.0 | grad norm: 5.815 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.256 | TFLOPs: 42.14 | +[default7]: iteration 932/ 6200 | consumed samples: 954368 | consumed tokens: 1954545664 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.027803E+00 | loss scale: 1024.0 | grad norm: 7.821 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.258 | TFLOPs: 42.14 | +[default7]: iteration 933/ 6200 | consumed samples: 955392 | consumed tokens: 1956642816 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.007641E+00 | loss scale: 1024.0 | grad norm: 6.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.121 | TFLOPs: 42.10 | +[default7]: iteration 934/ 6200 | consumed samples: 956416 | consumed tokens: 1958739968 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.003544E+00 | loss scale: 1024.0 | grad norm: 6.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.934 | TFLOPs: 42.04 | +[default7]: iteration 935/ 6200 | consumed samples: 957440 | consumed tokens: 1960837120 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.037965E+00 | loss scale: 1024.0 | grad norm: 7.033 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.279 | TFLOPs: 42.15 | +[default7]: iteration 936/ 6200 | consumed samples: 958464 | consumed tokens: 1962934272 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.024188E+00 | loss scale: 1024.0 | grad norm: 5.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.827 | TFLOPs: 42.01 | +[default7]: iteration 937/ 6200 | consumed samples: 959488 | consumed tokens: 1965031424 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.979726E+00 | loss scale: 1024.0 | grad norm: 6.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.110 | TFLOPs: 42.10 | +[default7]: iteration 938/ 6200 | consumed samples: 960512 | consumed tokens: 1967128576 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.046610E+00 | loss scale: 1024.0 | grad norm: 5.790 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.673 | TFLOPs: 41.96 | +[default7]: iteration 939/ 6200 | consumed samples: 961536 | consumed tokens: 1969225728 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.016706E+00 | loss scale: 1024.0 | grad norm: 6.007 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.785 | TFLOPs: 42.00 | +[default7]: iteration 940/ 6200 | consumed samples: 962560 | consumed tokens: 1971322880 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.023864E+00 | loss scale: 1024.0 | grad norm: 6.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.040 | TFLOPs: 42.08 | +[default7]: iteration 941/ 6200 | consumed samples: 963584 | consumed tokens: 1973420032 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.019055E+00 | loss scale: 1024.0 | grad norm: 5.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.025 | TFLOPs: 42.07 | +[default7]: iteration 942/ 6200 | consumed samples: 964608 | consumed tokens: 1975517184 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.003342E+00 | loss scale: 1024.0 | grad norm: 5.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.840 | TFLOPs: 42.01 | +[default7]: iteration 943/ 6200 | consumed samples: 965632 | consumed tokens: 1977614336 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.021190E+00 | loss scale: 1024.0 | grad norm: 6.722 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.445 | TFLOPs: 41.89 | +[default7]: iteration 944/ 6200 | consumed samples: 966656 | consumed tokens: 1979711488 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.015461E+00 | loss scale: 1024.0 | grad norm: 6.074 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.852 | TFLOPs: 42.02 | +[default7]: iteration 945/ 6200 | consumed samples: 967680 | consumed tokens: 1981808640 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.980794E+00 | loss scale: 1024.0 | grad norm: 5.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.963 | TFLOPs: 42.05 | +[default7]: iteration 946/ 6200 | consumed samples: 968704 | consumed tokens: 1983905792 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.008158E+00 | loss scale: 1024.0 | grad norm: 5.908 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.382 | TFLOPs: 42.18 | +[default7]: iteration 947/ 6200 | consumed samples: 969728 | consumed tokens: 1986002944 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.042222E+00 | loss scale: 1024.0 | grad norm: 5.686 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.208 | TFLOPs: 42.13 | +[default7]: iteration 948/ 6200 | consumed samples: 970752 | consumed tokens: 1988100096 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.003910E+00 | loss scale: 1024.0 | grad norm: 5.806 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.371 | TFLOPs: 42.18 | +[default7]: iteration 949/ 6200 | consumed samples: 971776 | consumed tokens: 1990197248 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.010395E+00 | loss scale: 1024.0 | grad norm: 5.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.974 | TFLOPs: 42.06 | +[default7]: iteration 950/ 6200 | consumed samples: 972800 | consumed tokens: 1992294400 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.993800E+00 | loss scale: 1024.0 | grad norm: 5.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.375 | TFLOPs: 41.87 | +[default7]: iteration 951/ 6200 | consumed samples: 973824 | consumed tokens: 1994391552 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.987177E+00 | loss scale: 1024.0 | grad norm: 7.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.032 | TFLOPs: 42.07 | +[default7]: iteration 952/ 6200 | consumed samples: 974848 | consumed tokens: 1996488704 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.035344E+00 | loss scale: 1024.0 | grad norm: 6.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.958 | TFLOPs: 42.05 | +[default7]: iteration 953/ 6200 | consumed samples: 975872 | consumed tokens: 1998585856 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.986498E+00 | loss scale: 1024.0 | grad norm: 5.735 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.733 | TFLOPs: 41.98 | +[default7]: iteration 954/ 6200 | consumed samples: 976896 | consumed tokens: 2000683008 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.014657E+00 | loss scale: 1024.0 | grad norm: 7.955 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.962 | TFLOPs: 41.75 | +[default7]: iteration 955/ 6200 | consumed samples: 977920 | consumed tokens: 2002780160 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.017039E+00 | loss scale: 1024.0 | grad norm: 6.643 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.316 | TFLOPs: 41.86 | +[default7]: iteration 956/ 6200 | consumed samples: 978944 | consumed tokens: 2004877312 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.020073E+00 | loss scale: 1024.0 | grad norm: 6.920 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.566 | TFLOPs: 42.24 | +[default7]: iteration 957/ 6200 | consumed samples: 979968 | consumed tokens: 2006974464 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.027374E+00 | loss scale: 1024.0 | grad norm: 7.923 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.586 | TFLOPs: 42.24 | +[default7]: iteration 958/ 6200 | consumed samples: 980992 | consumed tokens: 2009071616 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.001369E+00 | loss scale: 1024.0 | grad norm: 6.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.131 | TFLOPs: 42.10 | +[default7]: iteration 959/ 6200 | consumed samples: 982016 | consumed tokens: 2011168768 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.024883E+00 | loss scale: 1024.0 | grad norm: 7.605 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.197 | TFLOPs: 42.12 | +[default7]: iteration 960/ 6200 | consumed samples: 983040 | consumed tokens: 2013265920 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.001990E+00 | loss scale: 1024.0 | grad norm: 5.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.058 | TFLOPs: 42.08 | +[default7]: iteration 961/ 6200 | consumed samples: 984064 | consumed tokens: 2015363072 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.019768E+00 | loss scale: 1024.0 | grad norm: 6.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.044 | TFLOPs: 42.08 | +[default7]: iteration 962/ 6200 | consumed samples: 985088 | consumed tokens: 2017460224 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.028487E+00 | loss scale: 1024.0 | grad norm: 7.840 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.054 | TFLOPs: 42.08 | +[default7]: iteration 963/ 6200 | consumed samples: 986112 | consumed tokens: 2019557376 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.004797E+00 | loss scale: 1024.0 | grad norm: 6.712 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.975 | TFLOPs: 42.06 | +[default7]: iteration 964/ 6200 | consumed samples: 987136 | consumed tokens: 2021654528 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.022539E+00 | loss scale: 1024.0 | grad norm: 6.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.778 | TFLOPs: 42.00 | +[default7]: iteration 965/ 6200 | consumed samples: 988160 | consumed tokens: 2023751680 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.021399E+00 | loss scale: 1024.0 | grad norm: 5.805 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.014 | TFLOPs: 42.07 | +[default7]: iteration 966/ 6200 | consumed samples: 989184 | consumed tokens: 2025848832 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.010360E+00 | loss scale: 1024.0 | grad norm: 6.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.000 | TFLOPs: 42.06 | +[default7]: iteration 967/ 6200 | consumed samples: 990208 | consumed tokens: 2027945984 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.020622E+00 | loss scale: 1024.0 | grad norm: 7.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.551 | TFLOPs: 41.93 | +[default7]: iteration 968/ 6200 | consumed samples: 991232 | consumed tokens: 2030043136 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.988361E+00 | loss scale: 1024.0 | grad norm: 8.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.316 | TFLOPs: 42.16 | +[default7]: iteration 969/ 6200 | consumed samples: 992256 | consumed tokens: 2032140288 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.024349E+00 | loss scale: 1024.0 | grad norm: 6.060 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.837 | TFLOPs: 42.01 | +[default7]: iteration 970/ 6200 | consumed samples: 993280 | consumed tokens: 2034237440 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.015089E+00 | loss scale: 1024.0 | grad norm: 6.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.901 | TFLOPs: 42.03 | +[default7]: iteration 971/ 6200 | consumed samples: 994304 | consumed tokens: 2036334592 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.008984E+00 | loss scale: 1024.0 | grad norm: 7.028 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.008 | TFLOPs: 42.07 | +[default7]: iteration 972/ 6200 | consumed samples: 995328 | consumed tokens: 2038431744 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.010468E+00 | loss scale: 1024.0 | grad norm: 6.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.786 | TFLOPs: 42.00 | +[default7]: iteration 973/ 6200 | consumed samples: 996352 | consumed tokens: 2040528896 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.985830E+00 | loss scale: 1024.0 | grad norm: 5.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.372 | TFLOPs: 41.87 | +[default7]: iteration 974/ 6200 | consumed samples: 997376 | consumed tokens: 2042626048 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.007726E+00 | loss scale: 1024.0 | grad norm: 6.959 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.654 | TFLOPs: 41.96 | +[default7]: iteration 975/ 6200 | consumed samples: 998400 | consumed tokens: 2044723200 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.003658E+00 | loss scale: 1024.0 | grad norm: 7.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.885 | TFLOPs: 42.03 | +[default7]: iteration 976/ 6200 | consumed samples: 999424 | consumed tokens: 2046820352 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.993696E+00 | loss scale: 1024.0 | grad norm: 6.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.080 | TFLOPs: 42.09 | +[default7]: iteration 977/ 6200 | consumed samples: 1000448 | consumed tokens: 2048917504 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.014102E+00 | loss scale: 1024.0 | grad norm: 6.949 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.237 | TFLOPs: 42.14 | +[default7]: iteration 978/ 6200 | consumed samples: 1001472 | consumed tokens: 2051014656 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.011468E+00 | loss scale: 1024.0 | grad norm: 6.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.678 | TFLOPs: 41.97 | +[default7]: iteration 979/ 6200 | consumed samples: 1002496 | consumed tokens: 2053111808 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.999417E+00 | loss scale: 1024.0 | grad norm: 6.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.994 | TFLOPs: 42.06 | +[default7]: iteration 980/ 6200 | consumed samples: 1003520 | consumed tokens: 2055208960 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.002724E+00 | loss scale: 1024.0 | grad norm: 5.643 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.413 | TFLOPs: 41.88 | +[default7]: iteration 981/ 6200 | consumed samples: 1004544 | consumed tokens: 2057306112 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.012121E+00 | loss scale: 1024.0 | grad norm: 6.697 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.693 | TFLOPs: 41.97 | +[default7]: iteration 982/ 6200 | consumed samples: 1005568 | consumed tokens: 2059403264 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.000864E+00 | loss scale: 1024.0 | grad norm: 5.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.027 | TFLOPs: 42.07 | +[default7]: iteration 983/ 6200 | consumed samples: 1006592 | consumed tokens: 2061500416 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.016538E+00 | loss scale: 1024.0 | grad norm: 7.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.157 | TFLOPs: 42.11 | +[default7]: iteration 984/ 6200 | consumed samples: 1007616 | consumed tokens: 2063597568 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.025328E+00 | loss scale: 1024.0 | grad norm: 6.725 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.089 | TFLOPs: 42.09 | +[default7]: iteration 985/ 6200 | consumed samples: 1008640 | consumed tokens: 2065694720 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.012830E+00 | loss scale: 1024.0 | grad norm: 6.658 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.830 | TFLOPs: 42.01 | +[default7]: iteration 986/ 6200 | consumed samples: 1009664 | consumed tokens: 2067791872 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.018410E+00 | loss scale: 1024.0 | grad norm: 6.810 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.119 | TFLOPs: 42.10 | +[default7]: iteration 987/ 6200 | consumed samples: 1010688 | consumed tokens: 2069889024 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.017783E+00 | loss scale: 1024.0 | grad norm: 7.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.055 | TFLOPs: 42.08 | +[default7]: iteration 988/ 6200 | consumed samples: 1011712 | consumed tokens: 2071986176 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.010015E+00 | loss scale: 1024.0 | grad norm: 6.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.068 | TFLOPs: 42.08 | +[default7]: iteration 989/ 6200 | consumed samples: 1012736 | consumed tokens: 2074083328 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.012460E+00 | loss scale: 1024.0 | grad norm: 8.006 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.166 | TFLOPs: 42.11 | +[default7]: iteration 990/ 6200 | consumed samples: 1013760 | consumed tokens: 2076180480 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.023818E+00 | loss scale: 1024.0 | grad norm: 6.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.040 | TFLOPs: 42.08 | +[default7]: iteration 991/ 6200 | consumed samples: 1014784 | consumed tokens: 2078277632 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.989279E+00 | loss scale: 1024.0 | grad norm: 6.970 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.347 | TFLOPs: 42.17 | +[default7]: iteration 992/ 6200 | consumed samples: 1015808 | consumed tokens: 2080374784 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.992043E+00 | loss scale: 1024.0 | grad norm: 6.757 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.609 | TFLOPs: 42.25 | +[default7]: iteration 993/ 6200 | consumed samples: 1016832 | consumed tokens: 2082471936 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.002873E+00 | loss scale: 1024.0 | grad norm: 6.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.248 | TFLOPs: 42.14 | +[default7]: iteration 994/ 6200 | consumed samples: 1017856 | consumed tokens: 2084569088 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.989700E+00 | loss scale: 1024.0 | grad norm: 7.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.302 | TFLOPs: 42.16 | +[default7]: iteration 995/ 6200 | consumed samples: 1018880 | consumed tokens: 2086666240 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.029016E+00 | loss scale: 1024.0 | grad norm: 9.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.890 | TFLOPs: 42.03 | +[default7]: iteration 996/ 6200 | consumed samples: 1019904 | consumed tokens: 2088763392 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.012140E+00 | loss scale: 1024.0 | grad norm: 5.640 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.162 | TFLOPs: 42.11 | +[default7]: iteration 997/ 6200 | consumed samples: 1020928 | consumed tokens: 2090860544 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.988237E+00 | loss scale: 1024.0 | grad norm: 6.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.292 | TFLOPs: 42.15 | +[default7]: iteration 998/ 6200 | consumed samples: 1021952 | consumed tokens: 2092957696 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.994622E+00 | loss scale: 1024.0 | grad norm: 6.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.208 | TFLOPs: 42.13 | +[default7]: iteration 999/ 6200 | consumed samples: 1022976 | consumed tokens: 2095054848 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.004311E+00 | loss scale: 1024.0 | grad norm: 5.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.030 | TFLOPs: 42.07 | +[default7]: iteration 1000/ 6200 | consumed samples: 1024000 | consumed tokens: 2097152000 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.997857E+00 | loss scale: 1024.0 | grad norm: 6.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.793 | TFLOPs: 42.00 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 1000 | lm loss value: 3.428756E+00 | lm loss PPL: 3.083826E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 1000 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 14:52:20,850] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is begin to save! +[default0]:[2022-10-06 14:52:20,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_01-model_00-model_states.pt... +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 1000 | lm loss value: 1.904537E+00 | lm loss PPL: 6.716300E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:[2022-10-06 14:52:21,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,272] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,355] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,492] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,601] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,628] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,709] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,738] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,764] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,792] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,819] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,875] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,901] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,930] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,957] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_26-model_00-model_states.pt... +[default3]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 14:52:21,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 14:52:21,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 14:52:21,986] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/mp_rank_00_model_states.pt +[default0]:[2022-10-06 14:52:21,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 14:52:21,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 14:52:22,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 14:52:22,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 14:52:22,174] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 14:52:22,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 14:52:22,200] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 14:52:22,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 14:52:22,253] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 14:52:22,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 14:52:22,255] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 14:52:22,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 14:52:22,191] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 14:52:22,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 14:52:22,222] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 14:52:22,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 14:52:22,208] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 14:52:22,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 14:52:22,310] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 14:52:22,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 14:52:22,238] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 14:52:22,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 14:52:22,298] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 14:52:22,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 14:52:22,243] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 14:52:22,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 14:52:22,307] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 14:52:22,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 14:52:22,312] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 14:52:22,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 14:52:22,337] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 14:52:22,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 14:52:22,279] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 14:52:22,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 14:52:22,324] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 14:52:22,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 14:52:22,323] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 14:52:22,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 14:52:22,331] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 14:52:22,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 14:52:22,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 14:52:22,324] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 14:52:22,310] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 14:52:22,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 14:52:22,274] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 14:52:22,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 14:52:22,277] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 14:52:22,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 14:52:22,298] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 14:52:22,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 14:52:22,343] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 14:52:22,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 14:52:22,299] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 14:52:22,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 14:52:22,325] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 14:52:22,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 14:52:22,331] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 14:52:22,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 14:52:22,356] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default3]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default5]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default0]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 14:52:22,430] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default7]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default7]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default1]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default0]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default0]: successfully saved checkpoint at iteration 1000 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default4]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default6]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default4]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default4]:[2022-10-06 14:52:22,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 14:52:22,392] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default0]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default3]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default2]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default2]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default5]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default0]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default1]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default2]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default7]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default6]:[2022-10-06 14:52:22,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 14:52:22,413] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default5]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default5]:[2022-10-06 14:52:22,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 14:52:22,399] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1000/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default2]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default6]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default1]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default3]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default1]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default6]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default7]:[2022-10-06 14:52:22,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default7]:time (ms) | save-checkpoint: 1581.56 +[default4]:[2022-10-06 14:52:22,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +[default7]: iteration 1001/ 6200 | consumed samples: 1025024 | consumed tokens: 2099249152 | elapsed time per iteration (s): 53.65 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.016525E+00 | loss scale: 1024.0 | grad norm: 6.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.086 | TFLOPs: 5.82 | +[default7]: iteration 1002/ 6200 | consumed samples: 1026048 | consumed tokens: 2101346304 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.013674E+00 | loss scale: 1024.0 | grad norm: 6.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.018 | TFLOPs: 42.07 | +[default7]: iteration 1003/ 6200 | consumed samples: 1027072 | consumed tokens: 2103443456 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.003035E+00 | loss scale: 1024.0 | grad norm: 6.900 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.966 | TFLOPs: 42.05 | +[default7]: iteration 1004/ 6200 | consumed samples: 1028096 | consumed tokens: 2105540608 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.001252E+00 | loss scale: 2048.0 | grad norm: 2.906 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.923 | TFLOPs: 42.04 | +[default7]: iteration 1005/ 6200 | consumed samples: 1029120 | consumed tokens: 2107637760 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.004643E+00 | loss scale: 2048.0 | grad norm: 6.067 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.088 | TFLOPs: 42.09 | +[default7]: iteration 1006/ 6200 | consumed samples: 1030144 | consumed tokens: 2109734912 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.994887E+00 | loss scale: 2048.0 | grad norm: 5.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.494 | TFLOPs: 42.21 | +[default7]: iteration 1007/ 6200 | consumed samples: 1031168 | consumed tokens: 2111832064 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.994968E+00 | loss scale: 2048.0 | grad norm: 7.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.243 | TFLOPs: 42.14 | +[default7]: iteration 1008/ 6200 | consumed samples: 1032192 | consumed tokens: 2113929216 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.005630E+00 | loss scale: 2048.0 | grad norm: 6.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.985 | TFLOPs: 42.06 | +[default7]: iteration 1009/ 6200 | consumed samples: 1033216 | consumed tokens: 2116026368 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.969299E+00 | loss scale: 2048.0 | grad norm: 8.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.984 | TFLOPs: 42.06 | +[default7]: iteration 1010/ 6200 | consumed samples: 1034240 | consumed tokens: 2118123520 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.970393E+00 | loss scale: 2048.0 | grad norm: 6.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.138 | TFLOPs: 42.11 | +[default7]: iteration 1011/ 6200 | consumed samples: 1035264 | consumed tokens: 2120220672 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.003419E+00 | loss scale: 2048.0 | grad norm: 5.767 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.364 | TFLOPs: 42.17 | +[default7]: iteration 1012/ 6200 | consumed samples: 1036288 | consumed tokens: 2122317824 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.996396E+00 | loss scale: 2048.0 | grad norm: 7.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.017 | TFLOPs: 42.07 | +[default7]: iteration 1013/ 6200 | consumed samples: 1037312 | consumed tokens: 2124414976 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.961468E+00 | loss scale: 2048.0 | grad norm: 6.830 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.128 | TFLOPs: 42.10 | +[default7]: iteration 1014/ 6200 | consumed samples: 1038336 | consumed tokens: 2126512128 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.016210E+00 | loss scale: 2048.0 | grad norm: 6.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.026 | TFLOPs: 42.07 | +[default7]: iteration 1015/ 6200 | consumed samples: 1039360 | consumed tokens: 2128609280 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.981871E+00 | loss scale: 2048.0 | grad norm: 5.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.218 | TFLOPs: 42.13 | +[default7]: iteration 1016/ 6200 | consumed samples: 1040384 | consumed tokens: 2130706432 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.007174E+00 | loss scale: 2048.0 | grad norm: 6.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.076 | TFLOPs: 42.09 | +[default7]: iteration 1017/ 6200 | consumed samples: 1041408 | consumed tokens: 2132803584 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.012754E+00 | loss scale: 2048.0 | grad norm: 6.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.818 | TFLOPs: 42.01 | +[default7]: iteration 1018/ 6200 | consumed samples: 1042432 | consumed tokens: 2134900736 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.990978E+00 | loss scale: 2048.0 | grad norm: 6.725 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.743 | TFLOPs: 41.99 | +[default7]: iteration 1019/ 6200 | consumed samples: 1043456 | consumed tokens: 2136997888 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.007685E+00 | loss scale: 2048.0 | grad norm: 5.872 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.066 | TFLOPs: 42.08 | +[default7]: iteration 1020/ 6200 | consumed samples: 1044480 | consumed tokens: 2139095040 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.973744E+00 | loss scale: 2048.0 | grad norm: 5.873 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.934 | TFLOPs: 42.04 | +[default7]: iteration 1021/ 6200 | consumed samples: 1045504 | consumed tokens: 2141192192 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.990282E+00 | loss scale: 2048.0 | grad norm: 5.912 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.051 | TFLOPs: 42.08 | +[default7]: iteration 1022/ 6200 | consumed samples: 1046528 | consumed tokens: 2143289344 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.000837E+00 | loss scale: 2048.0 | grad norm: 6.890 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.925 | TFLOPs: 42.04 | +[default7]: iteration 1023/ 6200 | consumed samples: 1047552 | consumed tokens: 2145386496 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.010783E+00 | loss scale: 2048.0 | grad norm: 6.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.940 | TFLOPs: 42.05 | +[default7]: iteration 1024/ 6200 | consumed samples: 1048576 | consumed tokens: 2147483648 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.992851E+00 | loss scale: 2048.0 | grad norm: 7.663 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.270 | TFLOPs: 42.15 | +[default7]: iteration 1025/ 6200 | consumed samples: 1049600 | consumed tokens: 2149580800 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.035345E+00 | loss scale: 2048.0 | grad norm: 7.055 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.949 | TFLOPs: 42.05 | +[default7]: iteration 1026/ 6200 | consumed samples: 1050624 | consumed tokens: 2151677952 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.984658E+00 | loss scale: 2048.0 | grad norm: 7.877 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.064 | TFLOPs: 42.08 | +[default7]: iteration 1027/ 6200 | consumed samples: 1051648 | consumed tokens: 2153775104 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.007132E+00 | loss scale: 2048.0 | grad norm: 8.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.664 | TFLOPs: 41.96 | +[default7]: iteration 1028/ 6200 | consumed samples: 1052672 | consumed tokens: 2155872256 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.010857E+00 | loss scale: 2048.0 | grad norm: 7.060 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.692 | TFLOPs: 41.97 | +[default7]: iteration 1029/ 6200 | consumed samples: 1053696 | consumed tokens: 2157969408 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.990953E+00 | loss scale: 2048.0 | grad norm: 7.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.735 | TFLOPs: 41.98 | +[default7]: iteration 1030/ 6200 | consumed samples: 1054720 | consumed tokens: 2160066560 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.012243E+00 | loss scale: 2048.0 | grad norm: 12.918 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.838 | TFLOPs: 42.01 | +[default7]: iteration 1031/ 6200 | consumed samples: 1055744 | consumed tokens: 2162163712 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.012351E+00 | loss scale: 2048.0 | grad norm: 8.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.155 | TFLOPs: 42.11 | +[default7]: iteration 1032/ 6200 | consumed samples: 1056768 | consumed tokens: 2164260864 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.999052E+00 | loss scale: 2048.0 | grad norm: 6.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.933 | TFLOPs: 42.04 | +[default7]: iteration 1033/ 6200 | consumed samples: 1057792 | consumed tokens: 2166358016 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.995099E+00 | loss scale: 2048.0 | grad norm: 7.885 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.239 | TFLOPs: 42.14 | +[default7]: iteration 1034/ 6200 | consumed samples: 1058816 | consumed tokens: 2168455168 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.990845E+00 | loss scale: 2048.0 | grad norm: 8.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.713 | TFLOPs: 41.98 | +[default7]: iteration 1035/ 6200 | consumed samples: 1059840 | consumed tokens: 2170552320 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.000091E+00 | loss scale: 2048.0 | grad norm: 5.940 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.137 | TFLOPs: 42.11 | +[default7]: iteration 1036/ 6200 | consumed samples: 1060864 | consumed tokens: 2172649472 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.987937E+00 | loss scale: 2048.0 | grad norm: 6.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.235 | TFLOPs: 42.14 | +[default7]: iteration 1037/ 6200 | consumed samples: 1061888 | consumed tokens: 2174746624 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.011727E+00 | loss scale: 2048.0 | grad norm: 7.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.115 | TFLOPs: 42.10 | +[default7]: iteration 1038/ 6200 | consumed samples: 1062912 | consumed tokens: 2176843776 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.989839E+00 | loss scale: 2048.0 | grad norm: 6.869 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.172 | TFLOPs: 42.12 | +[default7]: iteration 1039/ 6200 | consumed samples: 1063936 | consumed tokens: 2178940928 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.011138E+00 | loss scale: 2048.0 | grad norm: 5.791 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.811 | TFLOPs: 42.01 | +[default7]: iteration 1040/ 6200 | consumed samples: 1064960 | consumed tokens: 2181038080 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.988014E+00 | loss scale: 2048.0 | grad norm: 6.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.972 | TFLOPs: 42.05 | +[default7]: iteration 1041/ 6200 | consumed samples: 1065984 | consumed tokens: 2183135232 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.997311E+00 | loss scale: 2048.0 | grad norm: 7.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.116 | TFLOPs: 42.10 | +[default7]: iteration 1042/ 6200 | consumed samples: 1067008 | consumed tokens: 2185232384 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.984024E+00 | loss scale: 2048.0 | grad norm: 6.819 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.960 | TFLOPs: 42.05 | +[default7]: iteration 1043/ 6200 | consumed samples: 1068032 | consumed tokens: 2187329536 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.963869E+00 | loss scale: 2048.0 | grad norm: 5.963 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.744 | TFLOPs: 41.99 | +[default7]: iteration 1044/ 6200 | consumed samples: 1069056 | consumed tokens: 2189426688 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.994664E+00 | loss scale: 2048.0 | grad norm: 6.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.009 | TFLOPs: 42.07 | +[default7]: iteration 1045/ 6200 | consumed samples: 1070080 | consumed tokens: 2191523840 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.988082E+00 | loss scale: 2048.0 | grad norm: 6.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.334 | TFLOPs: 42.17 | +[default7]: iteration 1046/ 6200 | consumed samples: 1071104 | consumed tokens: 2193620992 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.993279E+00 | loss scale: 2048.0 | grad norm: 5.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.729 | TFLOPs: 41.98 | +[default7]: iteration 1047/ 6200 | consumed samples: 1072128 | consumed tokens: 2195718144 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.999368E+00 | loss scale: 2048.0 | grad norm: 5.573 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.116 | TFLOPs: 42.10 | +[default7]: iteration 1048/ 6200 | consumed samples: 1073152 | consumed tokens: 2197815296 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.998365E+00 | loss scale: 2048.0 | grad norm: 7.032 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.948 | TFLOPs: 42.05 | +[default7]: iteration 1049/ 6200 | consumed samples: 1074176 | consumed tokens: 2199912448 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.985515E+00 | loss scale: 2048.0 | grad norm: 6.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.002 | TFLOPs: 42.06 | +[default7]: iteration 1050/ 6200 | consumed samples: 1075200 | consumed tokens: 2202009600 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.970340E+00 | loss scale: 2048.0 | grad norm: 6.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.087 | TFLOPs: 42.09 | +[default7]: iteration 1051/ 6200 | consumed samples: 1076224 | consumed tokens: 2204106752 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.007383E+00 | loss scale: 2048.0 | grad norm: 7.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.611 | TFLOPs: 41.94 | +[default7]: iteration 1052/ 6200 | consumed samples: 1077248 | consumed tokens: 2206203904 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.991418E+00 | loss scale: 2048.0 | grad norm: 6.756 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.332 | TFLOPs: 42.16 | +[default7]: iteration 1053/ 6200 | consumed samples: 1078272 | consumed tokens: 2208301056 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.995827E+00 | loss scale: 2048.0 | grad norm: 6.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.138 | TFLOPs: 42.11 | +[default7]: iteration 1054/ 6200 | consumed samples: 1079296 | consumed tokens: 2210398208 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.001658E+00 | loss scale: 2048.0 | grad norm: 5.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.370 | TFLOPs: 42.18 | +[default7]: iteration 1055/ 6200 | consumed samples: 1080320 | consumed tokens: 2212495360 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.990254E+00 | loss scale: 2048.0 | grad norm: 6.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.783 | TFLOPs: 42.00 | +[default7]: iteration 1056/ 6200 | consumed samples: 1081344 | consumed tokens: 2214592512 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.978772E+00 | loss scale: 2048.0 | grad norm: 8.918 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.877 | TFLOPs: 42.03 | +[default7]: iteration 1057/ 6200 | consumed samples: 1082368 | consumed tokens: 2216689664 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.975211E+00 | loss scale: 2048.0 | grad norm: 5.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.053 | TFLOPs: 42.08 | +[default7]: iteration 1058/ 6200 | consumed samples: 1083392 | consumed tokens: 2218786816 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.996332E+00 | loss scale: 2048.0 | grad norm: 6.011 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.971 | TFLOPs: 42.05 | +[default7]: iteration 1059/ 6200 | consumed samples: 1084416 | consumed tokens: 2220883968 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.994507E+00 | loss scale: 2048.0 | grad norm: 6.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.120 | TFLOPs: 42.10 | +[default7]: iteration 1060/ 6200 | consumed samples: 1085440 | consumed tokens: 2222981120 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.983929E+00 | loss scale: 2048.0 | grad norm: 7.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.843 | TFLOPs: 42.02 | +[default7]: iteration 1061/ 6200 | consumed samples: 1086464 | consumed tokens: 2225078272 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.020211E+00 | loss scale: 2048.0 | grad norm: 6.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.197 | TFLOPs: 42.12 | +[default7]: iteration 1062/ 6200 | consumed samples: 1087488 | consumed tokens: 2227175424 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.962127E+00 | loss scale: 2048.0 | grad norm: 6.859 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.061 | TFLOPs: 42.08 | +[default7]: iteration 1063/ 6200 | consumed samples: 1088512 | consumed tokens: 2229272576 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.013000E+00 | loss scale: 2048.0 | grad norm: 5.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.034 | TFLOPs: 42.07 | +[default7]: iteration 1064/ 6200 | consumed samples: 1089536 | consumed tokens: 2231369728 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.980192E+00 | loss scale: 2048.0 | grad norm: 7.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.408 | TFLOPs: 42.19 | +[default7]: iteration 1065/ 6200 | consumed samples: 1090560 | consumed tokens: 2233466880 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.012865E+00 | loss scale: 2048.0 | grad norm: 7.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.329 | TFLOPs: 42.16 | +[default7]: iteration 1066/ 6200 | consumed samples: 1091584 | consumed tokens: 2235564032 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.996614E+00 | loss scale: 2048.0 | grad norm: 5.715 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.001 | TFLOPs: 42.06 | +[default7]: iteration 1067/ 6200 | consumed samples: 1092608 | consumed tokens: 2237661184 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.992125E+00 | loss scale: 2048.0 | grad norm: 6.625 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.299 | TFLOPs: 42.15 | +[default7]: iteration 1068/ 6200 | consumed samples: 1093632 | consumed tokens: 2239758336 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.973685E+00 | loss scale: 2048.0 | grad norm: 5.848 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.511 | TFLOPs: 42.22 | +[default7]: iteration 1069/ 6200 | consumed samples: 1094656 | consumed tokens: 2241855488 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.961220E+00 | loss scale: 2048.0 | grad norm: 5.845 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.087 | TFLOPs: 42.09 | +[default7]: iteration 1070/ 6200 | consumed samples: 1095680 | consumed tokens: 2243952640 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.015249E+00 | loss scale: 2048.0 | grad norm: 5.734 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.447 | TFLOPs: 42.20 | +[default7]: iteration 1071/ 6200 | consumed samples: 1096704 | consumed tokens: 2246049792 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.996826E+00 | loss scale: 2048.0 | grad norm: 6.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.102 | TFLOPs: 42.09 | +[default7]: iteration 1072/ 6200 | consumed samples: 1097728 | consumed tokens: 2248146944 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.984317E+00 | loss scale: 2048.0 | grad norm: 6.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.200 | TFLOPs: 42.12 | +[default7]: iteration 1073/ 6200 | consumed samples: 1098752 | consumed tokens: 2250244096 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.970593E+00 | loss scale: 2048.0 | grad norm: 6.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.322 | TFLOPs: 42.16 | +[default7]: iteration 1074/ 6200 | consumed samples: 1099776 | consumed tokens: 2252341248 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.978526E+00 | loss scale: 2048.0 | grad norm: 6.071 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.137 | TFLOPs: 42.11 | +[default7]: iteration 1075/ 6200 | consumed samples: 1100800 | consumed tokens: 2254438400 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.978732E+00 | loss scale: 2048.0 | grad norm: 5.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.180 | TFLOPs: 42.12 | +[default7]: iteration 1076/ 6200 | consumed samples: 1101824 | consumed tokens: 2256535552 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.990129E+00 | loss scale: 2048.0 | grad norm: 6.702 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.608 | TFLOPs: 42.25 | +[default7]: iteration 1077/ 6200 | consumed samples: 1102848 | consumed tokens: 2258632704 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.953811E+00 | loss scale: 2048.0 | grad norm: 9.046 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.525 | TFLOPs: 42.22 | +[default7]: iteration 1078/ 6200 | consumed samples: 1103872 | consumed tokens: 2260729856 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.987535E+00 | loss scale: 2048.0 | grad norm: 6.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.250 | TFLOPs: 42.14 | +[default7]: iteration 1079/ 6200 | consumed samples: 1104896 | consumed tokens: 2262827008 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.993552E+00 | loss scale: 2048.0 | grad norm: 5.966 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.531 | TFLOPs: 42.23 | +[default7]: iteration 1080/ 6200 | consumed samples: 1105920 | consumed tokens: 2264924160 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.006916E+00 | loss scale: 2048.0 | grad norm: 7.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.931 | TFLOPs: 42.04 | +[default7]: iteration 1081/ 6200 | consumed samples: 1106944 | consumed tokens: 2267021312 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.006324E+00 | loss scale: 2048.0 | grad norm: 5.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.825 | TFLOPs: 42.01 | +[default7]: iteration 1082/ 6200 | consumed samples: 1107968 | consumed tokens: 2269118464 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.988908E+00 | loss scale: 2048.0 | grad norm: 5.910 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.023 | TFLOPs: 42.07 | +[default7]: iteration 1083/ 6200 | consumed samples: 1108992 | consumed tokens: 2271215616 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.990943E+00 | loss scale: 2048.0 | grad norm: 5.913 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.953 | TFLOPs: 42.05 | +[default7]: iteration 1084/ 6200 | consumed samples: 1110016 | consumed tokens: 2273312768 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.998300E+00 | loss scale: 2048.0 | grad norm: 5.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.209 | TFLOPs: 42.13 | +[default7]: iteration 1085/ 6200 | consumed samples: 1111040 | consumed tokens: 2275409920 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.972156E+00 | loss scale: 2048.0 | grad norm: 6.772 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.565 | TFLOPs: 41.93 | +[default7]: iteration 1086/ 6200 | consumed samples: 1112064 | consumed tokens: 2277507072 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.994031E+00 | loss scale: 2048.0 | grad norm: 6.046 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.525 | TFLOPs: 41.92 | +[default7]: iteration 1087/ 6200 | consumed samples: 1113088 | consumed tokens: 2279604224 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.998786E+00 | loss scale: 2048.0 | grad norm: 5.733 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.487 | TFLOPs: 41.91 | +[default7]: iteration 1088/ 6200 | consumed samples: 1114112 | consumed tokens: 2281701376 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.968537E+00 | loss scale: 2048.0 | grad norm: 6.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.905 | TFLOPs: 42.03 | +[default7]: iteration 1089/ 6200 | consumed samples: 1115136 | consumed tokens: 2283798528 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.977238E+00 | loss scale: 2048.0 | grad norm: 5.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.809 | TFLOPs: 42.01 | +[default7]: iteration 1090/ 6200 | consumed samples: 1116160 | consumed tokens: 2285895680 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.986367E+00 | loss scale: 2048.0 | grad norm: 10.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.750 | TFLOPs: 41.99 | +[default7]: iteration 1091/ 6200 | consumed samples: 1117184 | consumed tokens: 2287992832 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.005096E+00 | loss scale: 2048.0 | grad norm: 7.944 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.073 | TFLOPs: 42.09 | +[default7]: iteration 1092/ 6200 | consumed samples: 1118208 | consumed tokens: 2290089984 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.977026E+00 | loss scale: 2048.0 | grad norm: 6.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.155 | TFLOPs: 42.11 | +[default7]: iteration 1093/ 6200 | consumed samples: 1119232 | consumed tokens: 2292187136 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.992320E+00 | loss scale: 2048.0 | grad norm: 6.632 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.797 | TFLOPs: 42.00 | +[default7]: iteration 1094/ 6200 | consumed samples: 1120256 | consumed tokens: 2294284288 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.998244E+00 | loss scale: 2048.0 | grad norm: 11.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.575 | TFLOPs: 41.93 | +[default7]: iteration 1095/ 6200 | consumed samples: 1121280 | consumed tokens: 2296381440 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.956743E+00 | loss scale: 2048.0 | grad norm: 7.045 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.378 | TFLOPs: 41.87 | +[default7]: iteration 1096/ 6200 | consumed samples: 1122304 | consumed tokens: 2298478592 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.995006E+00 | loss scale: 2048.0 | grad norm: 6.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.737 | TFLOPs: 41.98 | +[default7]: iteration 1097/ 6200 | consumed samples: 1123328 | consumed tokens: 2300575744 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.989068E+00 | loss scale: 2048.0 | grad norm: 6.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.907 | TFLOPs: 42.04 | +[default7]: iteration 1098/ 6200 | consumed samples: 1124352 | consumed tokens: 2302672896 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.984686E+00 | loss scale: 2048.0 | grad norm: 6.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.183 | TFLOPs: 41.81 | +[default7]: iteration 1099/ 6200 | consumed samples: 1125376 | consumed tokens: 2304770048 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.994362E+00 | loss scale: 2048.0 | grad norm: 5.676 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.639 | TFLOPs: 41.95 | +[default7]: iteration 1100/ 6200 | consumed samples: 1126400 | consumed tokens: 2306867200 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.985581E+00 | loss scale: 2048.0 | grad norm: 5.958 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.975 | TFLOPs: 41.75 | +[default7]: iteration 1101/ 6200 | consumed samples: 1127424 | consumed tokens: 2308964352 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.978959E+00 | loss scale: 2048.0 | grad norm: 5.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.381 | TFLOPs: 41.87 | +[default7]: iteration 1102/ 6200 | consumed samples: 1128448 | consumed tokens: 2311061504 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.969443E+00 | loss scale: 2048.0 | grad norm: 6.743 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.762 | TFLOPs: 41.99 | +[default7]: iteration 1103/ 6200 | consumed samples: 1129472 | consumed tokens: 2313158656 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.987130E+00 | loss scale: 2048.0 | grad norm: 7.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.440 | TFLOPs: 41.89 | +[default7]: iteration 1104/ 6200 | consumed samples: 1130496 | consumed tokens: 2315255808 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.988231E+00 | loss scale: 2048.0 | grad norm: 5.821 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.619 | TFLOPs: 41.95 | +[default7]: iteration 1105/ 6200 | consumed samples: 1131520 | consumed tokens: 2317352960 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.970409E+00 | loss scale: 2048.0 | grad norm: 6.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.004 | TFLOPs: 41.76 | +[default7]: iteration 1106/ 6200 | consumed samples: 1132544 | consumed tokens: 2319450112 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.973618E+00 | loss scale: 2048.0 | grad norm: 6.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.768 | TFLOPs: 41.69 | +[default7]: iteration 1107/ 6200 | consumed samples: 1133568 | consumed tokens: 2321547264 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.980635E+00 | loss scale: 2048.0 | grad norm: 5.606 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.595 | TFLOPs: 41.64 | +[default7]: iteration 1108/ 6200 | consumed samples: 1134592 | consumed tokens: 2323644416 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.974141E+00 | loss scale: 2048.0 | grad norm: 6.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.683 | TFLOPs: 41.66 | +[default7]: iteration 1109/ 6200 | consumed samples: 1135616 | consumed tokens: 2325741568 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.002083E+00 | loss scale: 2048.0 | grad norm: 5.725 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.477 | TFLOPs: 41.60 | +[default7]: iteration 1110/ 6200 | consumed samples: 1136640 | consumed tokens: 2327838720 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.982537E+00 | loss scale: 2048.0 | grad norm: 6.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.413 | TFLOPs: 41.88 | +[default7]: iteration 1111/ 6200 | consumed samples: 1137664 | consumed tokens: 2329935872 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.987775E+00 | loss scale: 2048.0 | grad norm: 7.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.183 | TFLOPs: 41.81 | +[default7]: iteration 1112/ 6200 | consumed samples: 1138688 | consumed tokens: 2332033024 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.001901E+00 | loss scale: 2048.0 | grad norm: 6.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.945 | TFLOPs: 41.74 | +[default7]: iteration 1113/ 6200 | consumed samples: 1139712 | consumed tokens: 2334130176 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.998868E+00 | loss scale: 2048.0 | grad norm: 6.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.249 | TFLOPs: 41.83 | +[default7]: iteration 1114/ 6200 | consumed samples: 1140736 | consumed tokens: 2336227328 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.968360E+00 | loss scale: 2048.0 | grad norm: 6.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.359 | TFLOPs: 41.87 | +[default7]: iteration 1115/ 6200 | consumed samples: 1141760 | consumed tokens: 2338324480 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.958898E+00 | loss scale: 2048.0 | grad norm: 6.836 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.626 | TFLOPs: 41.95 | +[default7]: iteration 1116/ 6200 | consumed samples: 1142784 | consumed tokens: 2340421632 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.978367E+00 | loss scale: 2048.0 | grad norm: 5.542 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.740 | TFLOPs: 41.98 | +[default7]: iteration 1117/ 6200 | consumed samples: 1143808 | consumed tokens: 2342518784 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.995176E+00 | loss scale: 2048.0 | grad norm: 6.619 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.616 | TFLOPs: 41.95 | +[default7]: iteration 1118/ 6200 | consumed samples: 1144832 | consumed tokens: 2344615936 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.995588E+00 | loss scale: 2048.0 | grad norm: 5.969 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.316 | TFLOPs: 41.85 | +[default7]: iteration 1119/ 6200 | consumed samples: 1145856 | consumed tokens: 2346713088 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.957033E+00 | loss scale: 2048.0 | grad norm: 6.919 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.165 | TFLOPs: 41.81 | +[default7]: iteration 1120/ 6200 | consumed samples: 1146880 | consumed tokens: 2348810240 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.993183E+00 | loss scale: 2048.0 | grad norm: 6.848 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.020 | TFLOPs: 42.07 | +[default7]: iteration 1121/ 6200 | consumed samples: 1147904 | consumed tokens: 2350907392 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.994615E+00 | loss scale: 2048.0 | grad norm: 6.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.703 | TFLOPs: 41.97 | +[default7]: iteration 1122/ 6200 | consumed samples: 1148928 | consumed tokens: 2353004544 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.975821E+00 | loss scale: 2048.0 | grad norm: 6.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.416 | TFLOPs: 42.19 | +[default7]: iteration 1123/ 6200 | consumed samples: 1149952 | consumed tokens: 2355101696 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.991257E+00 | loss scale: 2048.0 | grad norm: 7.635 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.990 | TFLOPs: 42.37 | +[default7]: iteration 1124/ 6200 | consumed samples: 1150976 | consumed tokens: 2357198848 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.983926E+00 | loss scale: 2048.0 | grad norm: 6.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.255 | TFLOPs: 42.14 | +[default7]: iteration 1125/ 6200 | consumed samples: 1152000 | consumed tokens: 2359296000 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.955291E+00 | loss scale: 2048.0 | grad norm: 5.804 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.773 | TFLOPs: 42.30 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 1125 | lm loss value: 3.429139E+00 | lm loss PPL: 3.085007E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 1125 | lm loss value: 1.880867E+00 | lm loss PPL: 6.559189E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 1126/ 6200 | consumed samples: 1153024 | consumed tokens: 2361393152 | elapsed time per iteration (s): 51.85 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.997098E+00 | loss scale: 2048.0 | grad norm: 6.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.749 | TFLOPs: 6.02 | +[default7]: iteration 1127/ 6200 | consumed samples: 1154048 | consumed tokens: 2363490304 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.970949E+00 | loss scale: 2048.0 | grad norm: 5.932 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.205 | TFLOPs: 42.13 | +[default7]: iteration 1128/ 6200 | consumed samples: 1155072 | consumed tokens: 2365587456 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.991598E+00 | loss scale: 2048.0 | grad norm: 6.655 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.153 | TFLOPs: 42.11 | +[default7]: iteration 1129/ 6200 | consumed samples: 1156096 | consumed tokens: 2367684608 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.967140E+00 | loss scale: 2048.0 | grad norm: 6.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.176 | TFLOPs: 42.12 | +[default7]: iteration 1130/ 6200 | consumed samples: 1157120 | consumed tokens: 2369781760 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.980505E+00 | loss scale: 2048.0 | grad norm: 8.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.965 | TFLOPs: 42.05 | +[default7]: iteration 1131/ 6200 | consumed samples: 1158144 | consumed tokens: 2371878912 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.972604E+00 | loss scale: 2048.0 | grad norm: 6.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.560 | TFLOPs: 41.93 | +[default7]: iteration 1132/ 6200 | consumed samples: 1159168 | consumed tokens: 2373976064 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.954388E+00 | loss scale: 2048.0 | grad norm: 7.577 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.296 | TFLOPs: 42.15 | +[default7]: iteration 1133/ 6200 | consumed samples: 1160192 | consumed tokens: 2376073216 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.986639E+00 | loss scale: 2048.0 | grad norm: 6.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.704 | TFLOPs: 41.97 | +[default7]: iteration 1134/ 6200 | consumed samples: 1161216 | consumed tokens: 2378170368 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.979154E+00 | loss scale: 2048.0 | grad norm: 8.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.910 | TFLOPs: 42.04 | +[default7]: iteration 1135/ 6200 | consumed samples: 1162240 | consumed tokens: 2380267520 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.961941E+00 | loss scale: 2048.0 | grad norm: 9.765 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.813 | TFLOPs: 42.01 | +[default7]: iteration 1136/ 6200 | consumed samples: 1163264 | consumed tokens: 2382364672 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.971416E+00 | loss scale: 2048.0 | grad norm: 7.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.489 | TFLOPs: 41.91 | +[default7]: iteration 1137/ 6200 | consumed samples: 1164288 | consumed tokens: 2384461824 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.968657E+00 | loss scale: 2048.0 | grad norm: 5.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.664 | TFLOPs: 41.96 | +[default7]: iteration 1138/ 6200 | consumed samples: 1165312 | consumed tokens: 2386558976 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.959768E+00 | loss scale: 2048.0 | grad norm: 6.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.541 | TFLOPs: 41.92 | +[default7]: iteration 1139/ 6200 | consumed samples: 1166336 | consumed tokens: 2388656128 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.987388E+00 | loss scale: 2048.0 | grad norm: 6.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.251 | TFLOPs: 42.14 | +[default7]: iteration 1140/ 6200 | consumed samples: 1167360 | consumed tokens: 2390753280 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.997286E+00 | loss scale: 2048.0 | grad norm: 6.653 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.692 | TFLOPs: 42.27 | +[default7]: iteration 1141/ 6200 | consumed samples: 1168384 | consumed tokens: 2392850432 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.964715E+00 | loss scale: 2048.0 | grad norm: 6.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.433 | TFLOPs: 42.20 | +[default7]: iteration 1142/ 6200 | consumed samples: 1169408 | consumed tokens: 2394947584 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.982138E+00 | loss scale: 2048.0 | grad norm: 7.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.493 | TFLOPs: 42.21 | +[default7]: iteration 1143/ 6200 | consumed samples: 1170432 | consumed tokens: 2397044736 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.959660E+00 | loss scale: 2048.0 | grad norm: 6.925 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.618 | TFLOPs: 42.25 | +[default7]: iteration 1144/ 6200 | consumed samples: 1171456 | consumed tokens: 2399141888 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.981637E+00 | loss scale: 2048.0 | grad norm: 7.791 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.003 | TFLOPs: 42.06 | +[default7]: iteration 1145/ 6200 | consumed samples: 1172480 | consumed tokens: 2401239040 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.947283E+00 | loss scale: 2048.0 | grad norm: 5.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.237 | TFLOPs: 42.14 | +[default7]: iteration 1146/ 6200 | consumed samples: 1173504 | consumed tokens: 2403336192 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.951486E+00 | loss scale: 2048.0 | grad norm: 5.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.508 | TFLOPs: 42.22 | +[default7]: iteration 1147/ 6200 | consumed samples: 1174528 | consumed tokens: 2405433344 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.976059E+00 | loss scale: 2048.0 | grad norm: 5.997 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.472 | TFLOPs: 42.21 | +[default7]: iteration 1148/ 6200 | consumed samples: 1175552 | consumed tokens: 2407530496 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.011375E+00 | loss scale: 2048.0 | grad norm: 6.019 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.528 | TFLOPs: 42.22 | +[default7]: iteration 1149/ 6200 | consumed samples: 1176576 | consumed tokens: 2409627648 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.979007E+00 | loss scale: 2048.0 | grad norm: 6.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.358 | TFLOPs: 42.17 | +[default7]: iteration 1150/ 6200 | consumed samples: 1177600 | consumed tokens: 2411724800 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.982891E+00 | loss scale: 2048.0 | grad norm: 5.687 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.484 | TFLOPs: 42.21 | +[default7]: iteration 1151/ 6200 | consumed samples: 1178624 | consumed tokens: 2413821952 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.983566E+00 | loss scale: 2048.0 | grad norm: 6.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.394 | TFLOPs: 42.18 | +[default7]: iteration 1152/ 6200 | consumed samples: 1179648 | consumed tokens: 2415919104 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.936021E+00 | loss scale: 2048.0 | grad norm: 6.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.497 | TFLOPs: 42.22 | +[default7]: iteration 1153/ 6200 | consumed samples: 1180672 | consumed tokens: 2418016256 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.975377E+00 | loss scale: 2048.0 | grad norm: 6.895 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.586 | TFLOPs: 42.24 | +[default7]: iteration 1154/ 6200 | consumed samples: 1181696 | consumed tokens: 2420113408 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.966654E+00 | loss scale: 2048.0 | grad norm: 7.077 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.393 | TFLOPs: 42.18 | +[default7]: iteration 1155/ 6200 | consumed samples: 1182720 | consumed tokens: 2422210560 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.950092E+00 | loss scale: 2048.0 | grad norm: 6.816 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.758 | TFLOPs: 42.29 | +[default7]: iteration 1156/ 6200 | consumed samples: 1183744 | consumed tokens: 2424307712 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.958176E+00 | loss scale: 2048.0 | grad norm: 7.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.611 | TFLOPs: 42.25 | +[default7]: iteration 1157/ 6200 | consumed samples: 1184768 | consumed tokens: 2426404864 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.948381E+00 | loss scale: 2048.0 | grad norm: 8.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.496 | TFLOPs: 42.21 | +[default7]: iteration 1158/ 6200 | consumed samples: 1185792 | consumed tokens: 2428502016 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.983915E+00 | loss scale: 2048.0 | grad norm: 6.918 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.490 | TFLOPs: 42.21 | +[default7]: iteration 1159/ 6200 | consumed samples: 1186816 | consumed tokens: 2430599168 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.964511E+00 | loss scale: 2048.0 | grad norm: 7.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.617 | TFLOPs: 42.25 | +[default7]: iteration 1160/ 6200 | consumed samples: 1187840 | consumed tokens: 2432696320 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.966379E+00 | loss scale: 2048.0 | grad norm: 7.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.764 | TFLOPs: 42.30 | +[default7]: iteration 1161/ 6200 | consumed samples: 1188864 | consumed tokens: 2434793472 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.963033E+00 | loss scale: 2048.0 | grad norm: 5.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.708 | TFLOPs: 42.28 | +[default7]: iteration 1162/ 6200 | consumed samples: 1189888 | consumed tokens: 2436890624 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.980416E+00 | loss scale: 2048.0 | grad norm: 5.936 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.891 | TFLOPs: 42.34 | +[default7]: iteration 1163/ 6200 | consumed samples: 1190912 | consumed tokens: 2438987776 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.961433E+00 | loss scale: 2048.0 | grad norm: 6.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.868 | TFLOPs: 42.33 | +[default7]: iteration 1164/ 6200 | consumed samples: 1191936 | consumed tokens: 2441084928 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.959197E+00 | loss scale: 2048.0 | grad norm: 6.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.515 | TFLOPs: 42.22 | +[default7]: iteration 1165/ 6200 | consumed samples: 1192960 | consumed tokens: 2443182080 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.980016E+00 | loss scale: 2048.0 | grad norm: 6.060 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.435 | TFLOPs: 42.20 | +[default7]: iteration 1166/ 6200 | consumed samples: 1193984 | consumed tokens: 2445279232 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.979617E+00 | loss scale: 2048.0 | grad norm: 8.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.549 | TFLOPs: 42.23 | +[default7]: iteration 1167/ 6200 | consumed samples: 1195008 | consumed tokens: 2447376384 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.949202E+00 | loss scale: 2048.0 | grad norm: 7.056 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.583 | TFLOPs: 42.24 | +[default7]: iteration 1168/ 6200 | consumed samples: 1196032 | consumed tokens: 2449473536 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.973044E+00 | loss scale: 2048.0 | grad norm: 6.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.465 | TFLOPs: 42.21 | +[default7]: iteration 1169/ 6200 | consumed samples: 1197056 | consumed tokens: 2451570688 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.965304E+00 | loss scale: 2048.0 | grad norm: 10.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.386 | TFLOPs: 42.18 | +[default7]: iteration 1170/ 6200 | consumed samples: 1198080 | consumed tokens: 2453667840 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.987509E+00 | loss scale: 2048.0 | grad norm: 7.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.720 | TFLOPs: 42.28 | +[default7]: iteration 1171/ 6200 | consumed samples: 1199104 | consumed tokens: 2455764992 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.983878E+00 | loss scale: 2048.0 | grad norm: 6.739 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.658 | TFLOPs: 42.26 | +[default7]: iteration 1172/ 6200 | consumed samples: 1200128 | consumed tokens: 2457862144 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.971527E+00 | loss scale: 2048.0 | grad norm: 6.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.660 | TFLOPs: 42.26 | +[default7]: iteration 1173/ 6200 | consumed samples: 1201152 | consumed tokens: 2459959296 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.935266E+00 | loss scale: 2048.0 | grad norm: 5.990 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.950 | TFLOPs: 42.35 | +[default7]: iteration 1174/ 6200 | consumed samples: 1202176 | consumed tokens: 2462056448 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.944705E+00 | loss scale: 2048.0 | grad norm: 6.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.015 | TFLOPs: 42.37 | +[default7]: iteration 1175/ 6200 | consumed samples: 1203200 | consumed tokens: 2464153600 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.963613E+00 | loss scale: 2048.0 | grad norm: 5.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.097 | TFLOPs: 42.40 | +[default7]: iteration 1176/ 6200 | consumed samples: 1204224 | consumed tokens: 2466250752 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.977164E+00 | loss scale: 2048.0 | grad norm: 6.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.673 | TFLOPs: 42.27 | +[default7]: iteration 1177/ 6200 | consumed samples: 1205248 | consumed tokens: 2468347904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.983119E+00 | loss scale: 2048.0 | grad norm: 5.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.747 | TFLOPs: 42.29 | +[default7]: iteration 1178/ 6200 | consumed samples: 1206272 | consumed tokens: 2470445056 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.945201E+00 | loss scale: 2048.0 | grad norm: 5.918 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.716 | TFLOPs: 42.28 | +[default7]: iteration 1179/ 6200 | consumed samples: 1207296 | consumed tokens: 2472542208 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.980268E+00 | loss scale: 2048.0 | grad norm: 8.555 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.717 | TFLOPs: 42.28 | +[default7]: iteration 1180/ 6200 | consumed samples: 1208320 | consumed tokens: 2474639360 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.959142E+00 | loss scale: 2048.0 | grad norm: 6.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.861 | TFLOPs: 42.33 | +[default7]: iteration 1181/ 6200 | consumed samples: 1209344 | consumed tokens: 2476736512 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.996635E+00 | loss scale: 2048.0 | grad norm: 5.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.871 | TFLOPs: 42.33 | +[default7]: iteration 1182/ 6200 | consumed samples: 1210368 | consumed tokens: 2478833664 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.973081E+00 | loss scale: 2048.0 | grad norm: 5.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.785 | TFLOPs: 42.30 | +[default7]: iteration 1183/ 6200 | consumed samples: 1211392 | consumed tokens: 2480930816 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.947342E+00 | loss scale: 2048.0 | grad norm: 7.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.948 | TFLOPs: 42.35 | +[default7]: iteration 1184/ 6200 | consumed samples: 1212416 | consumed tokens: 2483027968 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.987416E+00 | loss scale: 2048.0 | grad norm: 6.852 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.792 | TFLOPs: 42.00 | +[default7]: iteration 1185/ 6200 | consumed samples: 1213440 | consumed tokens: 2485125120 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.959235E+00 | loss scale: 2048.0 | grad norm: 6.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.382 | TFLOPs: 42.18 | +[default7]: iteration 1186/ 6200 | consumed samples: 1214464 | consumed tokens: 2487222272 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.958674E+00 | loss scale: 2048.0 | grad norm: 6.055 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.109 | TFLOPs: 42.10 | +[default7]: iteration 1187/ 6200 | consumed samples: 1215488 | consumed tokens: 2489319424 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.946824E+00 | loss scale: 2048.0 | grad norm: 6.891 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.083 | TFLOPs: 42.09 | +[default7]: iteration 1188/ 6200 | consumed samples: 1216512 | consumed tokens: 2491416576 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.960362E+00 | loss scale: 2048.0 | grad norm: 6.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.853 | TFLOPs: 42.02 | +[default7]: iteration 1189/ 6200 | consumed samples: 1217536 | consumed tokens: 2493513728 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.982683E+00 | loss scale: 2048.0 | grad norm: 6.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.815 | TFLOPs: 42.31 | +[default7]: iteration 1190/ 6200 | consumed samples: 1218560 | consumed tokens: 2495610880 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.004171E+00 | loss scale: 2048.0 | grad norm: 6.684 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.704 | TFLOPs: 42.28 | +[default7]: iteration 1191/ 6200 | consumed samples: 1219584 | consumed tokens: 2497708032 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.937131E+00 | loss scale: 2048.0 | grad norm: 5.513 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.192 | TFLOPs: 42.12 | +[default7]: iteration 1192/ 6200 | consumed samples: 1220608 | consumed tokens: 2499805184 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.944904E+00 | loss scale: 2048.0 | grad norm: 5.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.602 | TFLOPs: 42.25 | +[default7]: iteration 1193/ 6200 | consumed samples: 1221632 | consumed tokens: 2501902336 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.975203E+00 | loss scale: 2048.0 | grad norm: 5.860 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.616 | TFLOPs: 42.25 | +[default7]: iteration 1194/ 6200 | consumed samples: 1222656 | consumed tokens: 2503999488 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.958681E+00 | loss scale: 2048.0 | grad norm: 6.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.827 | TFLOPs: 42.32 | +[default7]: iteration 1195/ 6200 | consumed samples: 1223680 | consumed tokens: 2506096640 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.961047E+00 | loss scale: 2048.0 | grad norm: 6.896 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.815 | TFLOPs: 42.31 | +[default7]: iteration 1196/ 6200 | consumed samples: 1224704 | consumed tokens: 2508193792 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.978687E+00 | loss scale: 2048.0 | grad norm: 6.851 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.243 | TFLOPs: 42.14 | +[default7]: iteration 1197/ 6200 | consumed samples: 1225728 | consumed tokens: 2510290944 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.976032E+00 | loss scale: 2048.0 | grad norm: 5.924 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.338 | TFLOPs: 42.17 | +[default7]: iteration 1198/ 6200 | consumed samples: 1226752 | consumed tokens: 2512388096 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.965288E+00 | loss scale: 2048.0 | grad norm: 7.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.858 | TFLOPs: 42.33 | +[default7]: iteration 1199/ 6200 | consumed samples: 1227776 | consumed tokens: 2514485248 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.941320E+00 | loss scale: 2048.0 | grad norm: 8.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.985 | TFLOPs: 41.75 | +[default7]: iteration 1200/ 6200 | consumed samples: 1228800 | consumed tokens: 2516582400 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.975904E+00 | loss scale: 2048.0 | grad norm: 6.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.670 | TFLOPs: 41.96 | +[default7]: iteration 1201/ 6200 | consumed samples: 1229824 | consumed tokens: 2518679552 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.976648E+00 | loss scale: 2048.0 | grad norm: 5.947 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.150 | TFLOPs: 41.80 | +[default7]: iteration 1202/ 6200 | consumed samples: 1230848 | consumed tokens: 2520776704 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.969789E+00 | loss scale: 2048.0 | grad norm: 7.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.441 | TFLOPs: 42.20 | +[default7]: iteration 1203/ 6200 | consumed samples: 1231872 | consumed tokens: 2522873856 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.954659E+00 | loss scale: 2048.0 | grad norm: 7.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.024 | TFLOPs: 42.07 | +[default7]: iteration 1204/ 6200 | consumed samples: 1232896 | consumed tokens: 2524971008 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.954477E+00 | loss scale: 2048.0 | grad norm: 6.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.353 | TFLOPs: 42.17 | +[default7]: iteration 1205/ 6200 | consumed samples: 1233920 | consumed tokens: 2527068160 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.930619E+00 | loss scale: 2048.0 | grad norm: 7.013 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.309 | TFLOPs: 42.16 | +[default7]: iteration 1206/ 6200 | consumed samples: 1234944 | consumed tokens: 2529165312 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.987509E+00 | loss scale: 2048.0 | grad norm: 7.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.989 | TFLOPs: 42.06 | +[default7]: iteration 1207/ 6200 | consumed samples: 1235968 | consumed tokens: 2531262464 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.965096E+00 | loss scale: 2048.0 | grad norm: 7.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.928 | TFLOPs: 42.04 | +[default7]: iteration 1208/ 6200 | consumed samples: 1236992 | consumed tokens: 2533359616 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.982844E+00 | loss scale: 2048.0 | grad norm: 6.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.110 | TFLOPs: 42.10 | +[default7]: iteration 1209/ 6200 | consumed samples: 1238016 | consumed tokens: 2535456768 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.971599E+00 | loss scale: 2048.0 | grad norm: 6.883 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.478 | TFLOPs: 42.21 | +[default7]: iteration 1210/ 6200 | consumed samples: 1239040 | consumed tokens: 2537553920 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.960758E+00 | loss scale: 2048.0 | grad norm: 6.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.754 | TFLOPs: 42.29 | +[default7]: iteration 1211/ 6200 | consumed samples: 1240064 | consumed tokens: 2539651072 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.989439E+00 | loss scale: 2048.0 | grad norm: 6.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.514 | TFLOPs: 42.22 | +[default7]: iteration 1212/ 6200 | consumed samples: 1241088 | consumed tokens: 2541748224 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.958298E+00 | loss scale: 2048.0 | grad norm: 5.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.686 | TFLOPs: 42.27 | +[default7]: iteration 1213/ 6200 | consumed samples: 1242112 | consumed tokens: 2543845376 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.963363E+00 | loss scale: 2048.0 | grad norm: 5.702 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.329 | TFLOPs: 42.16 | +[default7]: iteration 1214/ 6200 | consumed samples: 1243136 | consumed tokens: 2545942528 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.976209E+00 | loss scale: 2048.0 | grad norm: 5.827 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.582 | TFLOPs: 42.24 | +[default7]: iteration 1215/ 6200 | consumed samples: 1244160 | consumed tokens: 2548039680 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.943589E+00 | loss scale: 2048.0 | grad norm: 6.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.616 | TFLOPs: 42.25 | +[default7]: iteration 1216/ 6200 | consumed samples: 1245184 | consumed tokens: 2550136832 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.964246E+00 | loss scale: 2048.0 | grad norm: 5.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.257 | TFLOPs: 42.14 | +[default7]: iteration 1217/ 6200 | consumed samples: 1246208 | consumed tokens: 2552233984 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.937043E+00 | loss scale: 2048.0 | grad norm: 6.965 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.325 | TFLOPs: 42.16 | +[default7]: iteration 1218/ 6200 | consumed samples: 1247232 | consumed tokens: 2554331136 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.969561E+00 | loss scale: 2048.0 | grad norm: 7.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.261 | TFLOPs: 42.14 | +[default7]: iteration 1219/ 6200 | consumed samples: 1248256 | consumed tokens: 2556428288 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.965611E+00 | loss scale: 2048.0 | grad norm: 5.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.397 | TFLOPs: 42.18 | +[default7]: iteration 1220/ 6200 | consumed samples: 1249280 | consumed tokens: 2558525440 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.963257E+00 | loss scale: 2048.0 | grad norm: 5.992 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.679 | TFLOPs: 42.27 | +[default7]: iteration 1221/ 6200 | consumed samples: 1250304 | consumed tokens: 2560622592 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.979593E+00 | loss scale: 2048.0 | grad norm: 6.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.453 | TFLOPs: 42.20 | +[default7]: iteration 1222/ 6200 | consumed samples: 1251328 | consumed tokens: 2562719744 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.949066E+00 | loss scale: 2048.0 | grad norm: 6.594 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.475 | TFLOPs: 42.21 | +[default7]: iteration 1223/ 6200 | consumed samples: 1252352 | consumed tokens: 2564816896 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.942155E+00 | loss scale: 2048.0 | grad norm: 6.854 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.384 | TFLOPs: 42.18 | +[default7]: iteration 1224/ 6200 | consumed samples: 1253376 | consumed tokens: 2566914048 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.972187E+00 | loss scale: 2048.0 | grad norm: 6.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.518 | TFLOPs: 42.22 | +[default7]: iteration 1225/ 6200 | consumed samples: 1254400 | consumed tokens: 2569011200 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.965095E+00 | loss scale: 2048.0 | grad norm: 6.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.630 | TFLOPs: 42.26 | +[default7]: iteration 1226/ 6200 | consumed samples: 1255424 | consumed tokens: 2571108352 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.948314E+00 | loss scale: 2048.0 | grad norm: 5.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.258 | TFLOPs: 42.14 | +[default7]: iteration 1227/ 6200 | consumed samples: 1256448 | consumed tokens: 2573205504 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.965484E+00 | loss scale: 2048.0 | grad norm: 6.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.213 | TFLOPs: 42.13 | +[default7]: iteration 1228/ 6200 | consumed samples: 1257472 | consumed tokens: 2575302656 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.944159E+00 | loss scale: 2048.0 | grad norm: 6.012 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.520 | TFLOPs: 42.22 | +[default7]: iteration 1229/ 6200 | consumed samples: 1258496 | consumed tokens: 2577399808 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.951838E+00 | loss scale: 2048.0 | grad norm: 6.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.321 | TFLOPs: 42.16 | +[default7]: iteration 1230/ 6200 | consumed samples: 1259520 | consumed tokens: 2579496960 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.975760E+00 | loss scale: 2048.0 | grad norm: 6.859 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.245 | TFLOPs: 42.14 | +[default7]: iteration 1231/ 6200 | consumed samples: 1260544 | consumed tokens: 2581594112 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.964301E+00 | loss scale: 2048.0 | grad norm: 7.019 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.442 | TFLOPs: 42.20 | +[default7]: iteration 1232/ 6200 | consumed samples: 1261568 | consumed tokens: 2583691264 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.966146E+00 | loss scale: 2048.0 | grad norm: 5.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.385 | TFLOPs: 42.18 | +[default7]: iteration 1233/ 6200 | consumed samples: 1262592 | consumed tokens: 2585788416 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.974570E+00 | loss scale: 2048.0 | grad norm: 7.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.746 | TFLOPs: 42.29 | +[default7]: iteration 1234/ 6200 | consumed samples: 1263616 | consumed tokens: 2587885568 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.975170E+00 | loss scale: 2048.0 | grad norm: 5.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.462 | TFLOPs: 42.20 | +[default7]: iteration 1235/ 6200 | consumed samples: 1264640 | consumed tokens: 2589982720 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.952322E+00 | loss scale: 2048.0 | grad norm: 5.703 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.255 | TFLOPs: 42.14 | +[default7]: iteration 1236/ 6200 | consumed samples: 1265664 | consumed tokens: 2592079872 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.945290E+00 | loss scale: 2048.0 | grad norm: 5.975 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.402 | TFLOPs: 42.19 | +[default7]: iteration 1237/ 6200 | consumed samples: 1266688 | consumed tokens: 2594177024 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.964843E+00 | loss scale: 2048.0 | grad norm: 6.761 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.555 | TFLOPs: 42.23 | +[default7]: iteration 1238/ 6200 | consumed samples: 1267712 | consumed tokens: 2596274176 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.967120E+00 | loss scale: 2048.0 | grad norm: 6.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.251 | TFLOPs: 42.14 | +[default7]: iteration 1239/ 6200 | consumed samples: 1268736 | consumed tokens: 2598371328 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.933750E+00 | loss scale: 2048.0 | grad norm: 5.624 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.922 | TFLOPs: 42.04 | +[default7]: iteration 1240/ 6200 | consumed samples: 1269760 | consumed tokens: 2600468480 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.947647E+00 | loss scale: 2048.0 | grad norm: 6.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.981 | TFLOPs: 42.06 | +[default7]: iteration 1241/ 6200 | consumed samples: 1270784 | consumed tokens: 2602565632 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.942841E+00 | loss scale: 2048.0 | grad norm: 6.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.660 | TFLOPs: 41.96 | +[default7]: iteration 1242/ 6200 | consumed samples: 1271808 | consumed tokens: 2604662784 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.963630E+00 | loss scale: 2048.0 | grad norm: 6.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.081 | TFLOPs: 42.09 | +[default7]: iteration 1243/ 6200 | consumed samples: 1272832 | consumed tokens: 2606759936 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.961105E+00 | loss scale: 2048.0 | grad norm: 7.646 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.185 | TFLOPs: 42.12 | +[default7]: iteration 1244/ 6200 | consumed samples: 1273856 | consumed tokens: 2608857088 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.951670E+00 | loss scale: 2048.0 | grad norm: 6.547 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.202 | TFLOPs: 42.13 | +[default7]: iteration 1245/ 6200 | consumed samples: 1274880 | consumed tokens: 2610954240 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.962046E+00 | loss scale: 2048.0 | grad norm: 5.682 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.069 | TFLOPs: 42.08 | +[default7]: iteration 1246/ 6200 | consumed samples: 1275904 | consumed tokens: 2613051392 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.959594E+00 | loss scale: 2048.0 | grad norm: 6.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.580 | TFLOPs: 42.24 | +[default7]: iteration 1247/ 6200 | consumed samples: 1276928 | consumed tokens: 2615148544 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.957771E+00 | loss scale: 2048.0 | grad norm: 6.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.345 | TFLOPs: 42.17 | +[default7]: iteration 1248/ 6200 | consumed samples: 1277952 | consumed tokens: 2617245696 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.967530E+00 | loss scale: 2048.0 | grad norm: 6.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.486 | TFLOPs: 42.21 | +[default7]: iteration 1249/ 6200 | consumed samples: 1278976 | consumed tokens: 2619342848 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.948428E+00 | loss scale: 2048.0 | grad norm: 7.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.293 | TFLOPs: 42.15 | +[default7]: iteration 1250/ 6200 | consumed samples: 1280000 | consumed tokens: 2621440000 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.940657E+00 | loss scale: 2048.0 | grad norm: 6.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.472 | TFLOPs: 42.21 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 1250 | lm loss value: 3.420132E+00 | lm loss PPL: 3.057344E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 1250 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 15:24:44,587] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1250 is begin to save! +[default0]:[2022-10-06 15:24:44,593] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_01-model_00-model_states.pt... +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 1250 | lm loss value: 1.853554E+00 | lm loss PPL: 6.382461E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:[2022-10-06 15:24:45,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,035] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,063] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,199] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,225] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,280] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,332] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,387] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,415] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,441] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,467] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,493] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,519] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,574] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,601] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,628] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 15:24:45,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,654] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 15:24:45,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 15:24:45,656] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/mp_rank_00_model_states.pt +[default0]:[2022-10-06 15:24:45,656] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 15:24:45,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 15:24:45,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 15:24:45,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 15:24:45,868] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 15:24:45,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 15:24:45,885] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 15:24:45,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 15:24:45,883] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 15:24:45,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 15:24:45,877] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 15:24:45,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 15:24:45,898] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 15:24:45,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 15:24:45,926] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 15:24:45,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 15:24:45,882] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 15:24:45,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 15:24:45,909] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 15:24:45,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 15:24:45,883] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 15:24:45,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 15:24:45,955] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 15:24:45,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 15:24:45,995] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 15:24:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 15:24:45,998] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 15:24:46,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 15:24:46,010] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 15:24:46,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 15:24:46,005] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 15:24:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 15:24:45,998] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 15:24:45,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 15:24:45,974] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 15:24:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 15:24:45,996] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 15:24:46,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 15:24:46,020] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 15:24:46,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 15:24:46,018] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 15:24:46,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 15:24:46,009] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 15:24:46,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 15:24:46,044] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 15:24:46,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 15:24:46,028] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 15:24:46,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 15:24:46,041] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 15:24:46,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 15:24:46,029] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 15:24:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 15:24:45,996] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 15:24:46,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 15:24:46,033] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 15:24:46,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 15:24:46,010] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 15:24:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 15:24:45,996] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default5]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 15:24:46,120] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default3]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default2]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default0]:[2022-10-06 15:24:46,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 15:24:46,085] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default1]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default6]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default0]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default7]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default2]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default5]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default0]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default0]: successfully saved checkpoint at iteration 1250 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default3]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default4]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default1]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default5]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default6]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default1]:[2022-10-06 15:24:46,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 15:24:46,107] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default6]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default5]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default7]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default7]:time (ms) | save-checkpoint: 1534.14 +[default1]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default7]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default7]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default2]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default4]:[2022-10-06 15:24:46,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 15:24:46,103] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1250/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default6]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default0]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default4]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default3]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default2]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default4]:[2022-10-06 15:24:46,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1250 is ready now! +[default7]: iteration 1251/ 6200 | consumed samples: 1281024 | consumed tokens: 2623537152 | elapsed time per iteration (s): 53.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.967019E+00 | loss scale: 2048.0 | grad norm: 5.638 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.157 | TFLOPs: 5.84 | +[default7]: iteration 1252/ 6200 | consumed samples: 1282048 | consumed tokens: 2625634304 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.938727E+00 | loss scale: 2048.0 | grad norm: 5.862 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.380 | TFLOPs: 42.18 | +[default7]: iteration 1253/ 6200 | consumed samples: 1283072 | consumed tokens: 2627731456 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.950531E+00 | loss scale: 2048.0 | grad norm: 6.873 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.545 | TFLOPs: 42.23 | +[default7]: iteration 1254/ 6200 | consumed samples: 1284096 | consumed tokens: 2629828608 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.956789E+00 | loss scale: 2048.0 | grad norm: 7.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.498 | TFLOPs: 42.22 | +[default7]: iteration 1255/ 6200 | consumed samples: 1285120 | consumed tokens: 2631925760 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.979468E+00 | loss scale: 2048.0 | grad norm: 8.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.533 | TFLOPs: 42.23 | +[default7]: iteration 1256/ 6200 | consumed samples: 1286144 | consumed tokens: 2634022912 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.945780E+00 | loss scale: 2048.0 | grad norm: 7.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.639 | TFLOPs: 42.26 | +[default7]: iteration 1257/ 6200 | consumed samples: 1287168 | consumed tokens: 2636120064 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.920080E+00 | loss scale: 2048.0 | grad norm: 5.911 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.636 | TFLOPs: 42.26 | +[default7]: iteration 1258/ 6200 | consumed samples: 1288192 | consumed tokens: 2638217216 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.972506E+00 | loss scale: 2048.0 | grad norm: 8.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.505 | TFLOPs: 42.22 | +[default7]: iteration 1259/ 6200 | consumed samples: 1289216 | consumed tokens: 2640314368 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.974606E+00 | loss scale: 2048.0 | grad norm: 8.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.362 | TFLOPs: 42.17 | +[default7]: iteration 1260/ 6200 | consumed samples: 1290240 | consumed tokens: 2642411520 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.947115E+00 | loss scale: 2048.0 | grad norm: 6.782 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.371 | TFLOPs: 42.18 | +[default7]: iteration 1261/ 6200 | consumed samples: 1291264 | consumed tokens: 2644508672 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.957743E+00 | loss scale: 2048.0 | grad norm: 6.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.322 | TFLOPs: 42.16 | +[default7]: iteration 1262/ 6200 | consumed samples: 1292288 | consumed tokens: 2646605824 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.971354E+00 | loss scale: 2048.0 | grad norm: 5.553 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.493 | TFLOPs: 42.21 | +[default7]: iteration 1263/ 6200 | consumed samples: 1293312 | consumed tokens: 2648702976 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.951702E+00 | loss scale: 2048.0 | grad norm: 7.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.772 | TFLOPs: 42.30 | +[default7]: iteration 1264/ 6200 | consumed samples: 1294336 | consumed tokens: 2650800128 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.969753E+00 | loss scale: 2048.0 | grad norm: 6.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.403 | TFLOPs: 42.19 | +[default7]: iteration 1265/ 6200 | consumed samples: 1295360 | consumed tokens: 2652897280 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.960741E+00 | loss scale: 2048.0 | grad norm: 6.949 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.562 | TFLOPs: 42.23 | +[default7]: iteration 1266/ 6200 | consumed samples: 1296384 | consumed tokens: 2654994432 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.935165E+00 | loss scale: 2048.0 | grad norm: 7.625 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.297 | TFLOPs: 42.15 | +[default7]: iteration 1267/ 6200 | consumed samples: 1297408 | consumed tokens: 2657091584 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.958857E+00 | loss scale: 2048.0 | grad norm: 7.830 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.528 | TFLOPs: 42.22 | +[default7]: iteration 1268/ 6200 | consumed samples: 1298432 | consumed tokens: 2659188736 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.964486E+00 | loss scale: 2048.0 | grad norm: 6.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.610 | TFLOPs: 42.25 | +[default7]: iteration 1269/ 6200 | consumed samples: 1299456 | consumed tokens: 2661285888 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.955924E+00 | loss scale: 2048.0 | grad norm: 5.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.655 | TFLOPs: 42.26 | +[default7]: iteration 1270/ 6200 | consumed samples: 1300480 | consumed tokens: 2663383040 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.961844E+00 | loss scale: 2048.0 | grad norm: 7.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.638 | TFLOPs: 42.26 | +[default7]: iteration 1271/ 6200 | consumed samples: 1301504 | consumed tokens: 2665480192 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.977177E+00 | loss scale: 2048.0 | grad norm: 5.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.223 | TFLOPs: 42.13 | +[default7]: iteration 1272/ 6200 | consumed samples: 1302528 | consumed tokens: 2667577344 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.949515E+00 | loss scale: 2048.0 | grad norm: 6.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.418 | TFLOPs: 42.19 | +[default7]: iteration 1273/ 6200 | consumed samples: 1303552 | consumed tokens: 2669674496 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.955372E+00 | loss scale: 2048.0 | grad norm: 6.563 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.112 | TFLOPs: 42.10 | +[default7]: iteration 1274/ 6200 | consumed samples: 1304576 | consumed tokens: 2671771648 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.956464E+00 | loss scale: 2048.0 | grad norm: 5.766 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.087 | TFLOPs: 42.09 | +[default7]: iteration 1275/ 6200 | consumed samples: 1305600 | consumed tokens: 2673868800 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.954694E+00 | loss scale: 2048.0 | grad norm: 5.815 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.117 | TFLOPs: 42.10 | +[default7]: iteration 1276/ 6200 | consumed samples: 1306624 | consumed tokens: 2675965952 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.973186E+00 | loss scale: 2048.0 | grad norm: 6.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.471 | TFLOPs: 42.21 | +[default7]: iteration 1277/ 6200 | consumed samples: 1307648 | consumed tokens: 2678063104 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.941061E+00 | loss scale: 2048.0 | grad norm: 5.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.287 | TFLOPs: 42.15 | +[default7]: iteration 1278/ 6200 | consumed samples: 1308672 | consumed tokens: 2680160256 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.951620E+00 | loss scale: 2048.0 | grad norm: 5.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.081 | TFLOPs: 42.09 | +[default7]: iteration 1279/ 6200 | consumed samples: 1309696 | consumed tokens: 2682257408 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.954289E+00 | loss scale: 2048.0 | grad norm: 6.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.609 | TFLOPs: 42.25 | +[default7]: iteration 1280/ 6200 | consumed samples: 1310720 | consumed tokens: 2684354560 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.969638E+00 | loss scale: 2048.0 | grad norm: 6.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.901 | TFLOPs: 42.03 | +[default7]: iteration 1281/ 6200 | consumed samples: 1311744 | consumed tokens: 2686451712 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.910900E+00 | loss scale: 2048.0 | grad norm: 5.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.551 | TFLOPs: 42.23 | +[default7]: iteration 1282/ 6200 | consumed samples: 1312768 | consumed tokens: 2688548864 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.976737E+00 | loss scale: 2048.0 | grad norm: 5.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.732 | TFLOPs: 42.29 | +[default7]: iteration 1283/ 6200 | consumed samples: 1313792 | consumed tokens: 2690646016 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.975137E+00 | loss scale: 2048.0 | grad norm: 6.878 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.517 | TFLOPs: 42.22 | +[default7]: iteration 1284/ 6200 | consumed samples: 1314816 | consumed tokens: 2692743168 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.948513E+00 | loss scale: 2048.0 | grad norm: 6.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.785 | TFLOPs: 42.30 | +[default7]: iteration 1285/ 6200 | consumed samples: 1315840 | consumed tokens: 2694840320 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.929582E+00 | loss scale: 2048.0 | grad norm: 5.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.926 | TFLOPs: 42.35 | +[default7]: iteration 1286/ 6200 | consumed samples: 1316864 | consumed tokens: 2696937472 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.945688E+00 | loss scale: 2048.0 | grad norm: 6.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.627 | TFLOPs: 42.25 | +[default7]: iteration 1287/ 6200 | consumed samples: 1317888 | consumed tokens: 2699034624 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.962006E+00 | loss scale: 2048.0 | grad norm: 5.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.516 | TFLOPs: 42.22 | +[default7]: iteration 1288/ 6200 | consumed samples: 1318912 | consumed tokens: 2701131776 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.962324E+00 | loss scale: 2048.0 | grad norm: 5.812 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.534 | TFLOPs: 42.23 | +[default7]: iteration 1289/ 6200 | consumed samples: 1319936 | consumed tokens: 2703228928 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.947179E+00 | loss scale: 2048.0 | grad norm: 6.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.520 | TFLOPs: 42.22 | +[default7]: iteration 1290/ 6200 | consumed samples: 1320960 | consumed tokens: 2705326080 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.940632E+00 | loss scale: 2048.0 | grad norm: 6.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.519 | TFLOPs: 42.22 | +[default7]: iteration 1291/ 6200 | consumed samples: 1321984 | consumed tokens: 2707423232 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.929114E+00 | loss scale: 2048.0 | grad norm: 5.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.585 | TFLOPs: 42.24 | +[default7]: iteration 1292/ 6200 | consumed samples: 1323008 | consumed tokens: 2709520384 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.936512E+00 | loss scale: 2048.0 | grad norm: 5.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.624 | TFLOPs: 42.25 | +[default7]: iteration 1293/ 6200 | consumed samples: 1324032 | consumed tokens: 2711617536 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.963988E+00 | loss scale: 2048.0 | grad norm: 5.980 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.373 | TFLOPs: 42.18 | +[default7]: iteration 1294/ 6200 | consumed samples: 1325056 | consumed tokens: 2713714688 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.924432E+00 | loss scale: 2048.0 | grad norm: 8.793 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.488 | TFLOPs: 42.21 | +[default7]: iteration 1295/ 6200 | consumed samples: 1326080 | consumed tokens: 2715811840 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.952403E+00 | loss scale: 2048.0 | grad norm: 6.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.362 | TFLOPs: 42.17 | +[default7]: iteration 1296/ 6200 | consumed samples: 1327104 | consumed tokens: 2717908992 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.953217E+00 | loss scale: 2048.0 | grad norm: 7.711 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.304 | TFLOPs: 42.16 | +[default7]: iteration 1297/ 6200 | consumed samples: 1328128 | consumed tokens: 2720006144 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.958850E+00 | loss scale: 2048.0 | grad norm: 7.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.388 | TFLOPs: 42.18 | +[default7]: iteration 1298/ 6200 | consumed samples: 1329152 | consumed tokens: 2722103296 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.950774E+00 | loss scale: 2048.0 | grad norm: 6.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.612 | TFLOPs: 42.25 | +[default7]: iteration 1299/ 6200 | consumed samples: 1330176 | consumed tokens: 2724200448 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.918455E+00 | loss scale: 2048.0 | grad norm: 5.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.338 | TFLOPs: 42.17 | +[default7]: iteration 1300/ 6200 | consumed samples: 1331200 | consumed tokens: 2726297600 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.942752E+00 | loss scale: 2048.0 | grad norm: 9.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.399 | TFLOPs: 42.19 | +[default7]: iteration 1301/ 6200 | consumed samples: 1332224 | consumed tokens: 2728394752 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.926842E+00 | loss scale: 2048.0 | grad norm: 6.575 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.074 | TFLOPs: 42.09 | +[default7]: iteration 1302/ 6200 | consumed samples: 1333248 | consumed tokens: 2730491904 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.960216E+00 | loss scale: 2048.0 | grad norm: 5.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.386 | TFLOPs: 42.18 | +[default7]: iteration 1303/ 6200 | consumed samples: 1334272 | consumed tokens: 2732589056 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.955201E+00 | loss scale: 2048.0 | grad norm: 6.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.511 | TFLOPs: 42.22 | +[default7]: iteration 1304/ 6200 | consumed samples: 1335296 | consumed tokens: 2734686208 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.955222E+00 | loss scale: 2048.0 | grad norm: 5.779 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.120 | TFLOPs: 42.10 | +[default7]: iteration 1305/ 6200 | consumed samples: 1336320 | consumed tokens: 2736783360 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.955266E+00 | loss scale: 2048.0 | grad norm: 5.565 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.791 | TFLOPs: 42.00 | +[default7]: iteration 1306/ 6200 | consumed samples: 1337344 | consumed tokens: 2738880512 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.953541E+00 | loss scale: 2048.0 | grad norm: 7.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.675 | TFLOPs: 42.27 | +[default7]: iteration 1307/ 6200 | consumed samples: 1338368 | consumed tokens: 2740977664 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.951531E+00 | loss scale: 2048.0 | grad norm: 5.774 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.498 | TFLOPs: 42.22 | +[default7]: iteration 1308/ 6200 | consumed samples: 1339392 | consumed tokens: 2743074816 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.922167E+00 | loss scale: 2048.0 | grad norm: 6.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.457 | TFLOPs: 42.20 | +[default7]: iteration 1309/ 6200 | consumed samples: 1340416 | consumed tokens: 2745171968 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.946634E+00 | loss scale: 2048.0 | grad norm: 6.690 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.496 | TFLOPs: 42.21 | +[default7]: iteration 1310/ 6200 | consumed samples: 1341440 | consumed tokens: 2747269120 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.955958E+00 | loss scale: 2048.0 | grad norm: 5.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.664 | TFLOPs: 42.27 | +[default7]: iteration 1311/ 6200 | consumed samples: 1342464 | consumed tokens: 2749366272 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.948244E+00 | loss scale: 2048.0 | grad norm: 5.747 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.657 | TFLOPs: 42.26 | +[default7]: iteration 1312/ 6200 | consumed samples: 1343488 | consumed tokens: 2751463424 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.940102E+00 | loss scale: 2048.0 | grad norm: 5.976 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.372 | TFLOPs: 42.18 | +[default7]: iteration 1313/ 6200 | consumed samples: 1344512 | consumed tokens: 2753560576 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.945784E+00 | loss scale: 2048.0 | grad norm: 6.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.467 | TFLOPs: 42.21 | +[default7]: iteration 1314/ 6200 | consumed samples: 1345536 | consumed tokens: 2755657728 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.942541E+00 | loss scale: 2048.0 | grad norm: 8.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.671 | TFLOPs: 42.27 | +[default7]: iteration 1315/ 6200 | consumed samples: 1346560 | consumed tokens: 2757754880 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.945179E+00 | loss scale: 2048.0 | grad norm: 7.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.885 | TFLOPs: 42.33 | +[default7]: iteration 1316/ 6200 | consumed samples: 1347584 | consumed tokens: 2759852032 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.950551E+00 | loss scale: 2048.0 | grad norm: 7.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.525 | TFLOPs: 42.22 | +[default7]: iteration 1317/ 6200 | consumed samples: 1348608 | consumed tokens: 2761949184 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.937906E+00 | loss scale: 2048.0 | grad norm: 6.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.254 | TFLOPs: 42.14 | +[default7]: iteration 1318/ 6200 | consumed samples: 1349632 | consumed tokens: 2764046336 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.936732E+00 | loss scale: 2048.0 | grad norm: 6.026 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.283 | TFLOPs: 42.15 | +[default7]: iteration 1319/ 6200 | consumed samples: 1350656 | consumed tokens: 2766143488 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.948017E+00 | loss scale: 2048.0 | grad norm: 6.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.019 | TFLOPs: 42.07 | +[default7]: iteration 1320/ 6200 | consumed samples: 1351680 | consumed tokens: 2768240640 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.957555E+00 | loss scale: 2048.0 | grad norm: 5.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.952 | TFLOPs: 42.05 | +[default7]: iteration 1321/ 6200 | consumed samples: 1352704 | consumed tokens: 2770337792 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.930094E+00 | loss scale: 2048.0 | grad norm: 5.645 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.231 | TFLOPs: 42.13 | +[default7]: iteration 1322/ 6200 | consumed samples: 1353728 | consumed tokens: 2772434944 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.942676E+00 | loss scale: 2048.0 | grad norm: 5.775 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.191 | TFLOPs: 42.12 | +[default7]: iteration 1323/ 6200 | consumed samples: 1354752 | consumed tokens: 2774532096 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.931730E+00 | loss scale: 2048.0 | grad norm: 6.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.143 | TFLOPs: 42.11 | +[default7]: iteration 1324/ 6200 | consumed samples: 1355776 | consumed tokens: 2776629248 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.960578E+00 | loss scale: 2048.0 | grad norm: 6.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.014 | TFLOPs: 42.07 | +[default7]: iteration 1325/ 6200 | consumed samples: 1356800 | consumed tokens: 2778726400 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.928368E+00 | loss scale: 2048.0 | grad norm: 7.675 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.867 | TFLOPs: 42.02 | +[default7]: iteration 1326/ 6200 | consumed samples: 1357824 | consumed tokens: 2780823552 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.934541E+00 | loss scale: 2048.0 | grad norm: 7.943 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.428 | TFLOPs: 41.89 | +[default7]: iteration 1327/ 6200 | consumed samples: 1358848 | consumed tokens: 2782920704 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.944618E+00 | loss scale: 2048.0 | grad norm: 6.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.711 | TFLOPs: 41.98 | +[default7]: iteration 1328/ 6200 | consumed samples: 1359872 | consumed tokens: 2785017856 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.968517E+00 | loss scale: 2048.0 | grad norm: 7.847 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.504 | TFLOPs: 41.91 | +[default7]: iteration 1329/ 6200 | consumed samples: 1360896 | consumed tokens: 2787115008 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.952428E+00 | loss scale: 2048.0 | grad norm: 7.886 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.051 | TFLOPs: 42.08 | +[default7]: iteration 1330/ 6200 | consumed samples: 1361920 | consumed tokens: 2789212160 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.943789E+00 | loss scale: 2048.0 | grad norm: 6.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.143 | TFLOPs: 42.11 | +[default7]: iteration 1331/ 6200 | consumed samples: 1362944 | consumed tokens: 2791309312 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.917819E+00 | loss scale: 2048.0 | grad norm: 5.981 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.642 | TFLOPs: 41.95 | +[default7]: iteration 1332/ 6200 | consumed samples: 1363968 | consumed tokens: 2793406464 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.924258E+00 | loss scale: 2048.0 | grad norm: 8.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.136 | TFLOPs: 42.10 | +[default7]: iteration 1333/ 6200 | consumed samples: 1364992 | consumed tokens: 2795503616 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.924143E+00 | loss scale: 2048.0 | grad norm: 5.645 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.336 | TFLOPs: 42.17 | +[default7]: iteration 1334/ 6200 | consumed samples: 1366016 | consumed tokens: 2797600768 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.938908E+00 | loss scale: 2048.0 | grad norm: 6.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.776 | TFLOPs: 42.00 | +[default7]: iteration 1335/ 6200 | consumed samples: 1367040 | consumed tokens: 2799697920 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.946617E+00 | loss scale: 2048.0 | grad norm: 6.805 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.892 | TFLOPs: 42.03 | +[default7]: iteration 1336/ 6200 | consumed samples: 1368064 | consumed tokens: 2801795072 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.956274E+00 | loss scale: 2048.0 | grad norm: 7.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.060 | TFLOPs: 42.08 | +[default7]: iteration 1337/ 6200 | consumed samples: 1369088 | consumed tokens: 2803892224 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.927407E+00 | loss scale: 2048.0 | grad norm: 5.743 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.710 | TFLOPs: 41.98 | +[default7]: iteration 1338/ 6200 | consumed samples: 1370112 | consumed tokens: 2805989376 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.940401E+00 | loss scale: 2048.0 | grad norm: 5.996 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.523 | TFLOPs: 41.92 | +[default7]: iteration 1339/ 6200 | consumed samples: 1371136 | consumed tokens: 2808086528 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.931354E+00 | loss scale: 2048.0 | grad norm: 8.954 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.751 | TFLOPs: 41.99 | +[default7]: iteration 1340/ 6200 | consumed samples: 1372160 | consumed tokens: 2810183680 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.965021E+00 | loss scale: 2048.0 | grad norm: 8.975 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.852 | TFLOPs: 42.02 | +[default7]: iteration 1341/ 6200 | consumed samples: 1373184 | consumed tokens: 2812280832 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.952228E+00 | loss scale: 2048.0 | grad norm: 6.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.160 | TFLOPs: 42.11 | +[default7]: iteration 1342/ 6200 | consumed samples: 1374208 | consumed tokens: 2814377984 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.941660E+00 | loss scale: 2048.0 | grad norm: 7.738 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.315 | TFLOPs: 42.16 | +[default7]: iteration 1343/ 6200 | consumed samples: 1375232 | consumed tokens: 2816475136 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.946459E+00 | loss scale: 2048.0 | grad norm: 8.744 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.441 | TFLOPs: 42.20 | +[default7]: iteration 1344/ 6200 | consumed samples: 1376256 | consumed tokens: 2818572288 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.951002E+00 | loss scale: 2048.0 | grad norm: 6.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.291 | TFLOPs: 42.15 | +[default7]: iteration 1345/ 6200 | consumed samples: 1377280 | consumed tokens: 2820669440 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.941368E+00 | loss scale: 2048.0 | grad norm: 5.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.323 | TFLOPs: 42.16 | +[default7]: iteration 1346/ 6200 | consumed samples: 1378304 | consumed tokens: 2822766592 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.932653E+00 | loss scale: 2048.0 | grad norm: 7.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.971 | TFLOPs: 42.05 | +[default7]: iteration 1347/ 6200 | consumed samples: 1379328 | consumed tokens: 2824863744 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.958937E+00 | loss scale: 2048.0 | grad norm: 5.860 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.317 | TFLOPs: 42.16 | +[default7]: iteration 1348/ 6200 | consumed samples: 1380352 | consumed tokens: 2826960896 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.940792E+00 | loss scale: 2048.0 | grad norm: 6.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.172 | TFLOPs: 42.12 | +[default7]: iteration 1349/ 6200 | consumed samples: 1381376 | consumed tokens: 2829058048 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.922699E+00 | loss scale: 2048.0 | grad norm: 6.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.909 | TFLOPs: 42.04 | +[default7]: iteration 1350/ 6200 | consumed samples: 1382400 | consumed tokens: 2831155200 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.948530E+00 | loss scale: 2048.0 | grad norm: 7.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.589 | TFLOPs: 41.94 | +[default7]: iteration 1351/ 6200 | consumed samples: 1383424 | consumed tokens: 2833252352 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.942047E+00 | loss scale: 2048.0 | grad norm: 7.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.772 | TFLOPs: 41.99 | +[default7]: iteration 1352/ 6200 | consumed samples: 1384448 | consumed tokens: 2835349504 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.944826E+00 | loss scale: 2048.0 | grad norm: 6.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.715 | TFLOPs: 41.98 | +[default7]: iteration 1353/ 6200 | consumed samples: 1385472 | consumed tokens: 2837446656 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.923368E+00 | loss scale: 2048.0 | grad norm: 6.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.042 | TFLOPs: 42.08 | +[default7]: iteration 1354/ 6200 | consumed samples: 1386496 | consumed tokens: 2839543808 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.936273E+00 | loss scale: 2048.0 | grad norm: 5.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.976 | TFLOPs: 42.06 | +[default7]: iteration 1355/ 6200 | consumed samples: 1387520 | consumed tokens: 2841640960 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.953816E+00 | loss scale: 2048.0 | grad norm: 6.606 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.864 | TFLOPs: 42.02 | +[default7]: iteration 1356/ 6200 | consumed samples: 1388544 | consumed tokens: 2843738112 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.941232E+00 | loss scale: 2048.0 | grad norm: 6.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.763 | TFLOPs: 41.99 | +[default7]: iteration 1357/ 6200 | consumed samples: 1389568 | consumed tokens: 2845835264 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.958987E+00 | loss scale: 2048.0 | grad norm: 6.643 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.727 | TFLOPs: 41.98 | +[default7]: iteration 1358/ 6200 | consumed samples: 1390592 | consumed tokens: 2847932416 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.950736E+00 | loss scale: 2048.0 | grad norm: 7.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.197 | TFLOPs: 42.12 | +[default7]: iteration 1359/ 6200 | consumed samples: 1391616 | consumed tokens: 2850029568 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.944479E+00 | loss scale: 2048.0 | grad norm: 6.582 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.046 | TFLOPs: 42.08 | +[default7]: iteration 1360/ 6200 | consumed samples: 1392640 | consumed tokens: 2852126720 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.913584E+00 | loss scale: 2048.0 | grad norm: 5.973 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.269 | TFLOPs: 42.15 | +[default7]: iteration 1361/ 6200 | consumed samples: 1393664 | consumed tokens: 2854223872 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.953055E+00 | loss scale: 2048.0 | grad norm: 5.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.604 | TFLOPs: 41.94 | +[default7]: iteration 1362/ 6200 | consumed samples: 1394688 | consumed tokens: 2856321024 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.937204E+00 | loss scale: 2048.0 | grad norm: 6.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.768 | TFLOPs: 41.99 | +[default7]: iteration 1363/ 6200 | consumed samples: 1395712 | consumed tokens: 2858418176 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.942411E+00 | loss scale: 2048.0 | grad norm: 6.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.106 | TFLOPs: 42.10 | +[default7]: iteration 1364/ 6200 | consumed samples: 1396736 | consumed tokens: 2860515328 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.929814E+00 | loss scale: 2048.0 | grad norm: 5.784 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.967 | TFLOPs: 42.05 | +[default7]: iteration 1365/ 6200 | consumed samples: 1397760 | consumed tokens: 2862612480 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.944427E+00 | loss scale: 2048.0 | grad norm: 7.782 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.801 | TFLOPs: 42.00 | +[default7]: iteration 1366/ 6200 | consumed samples: 1398784 | consumed tokens: 2864709632 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.922703E+00 | loss scale: 2048.0 | grad norm: 7.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.111 | TFLOPs: 42.10 | +[default7]: iteration 1367/ 6200 | consumed samples: 1399808 | consumed tokens: 2866806784 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.939397E+00 | loss scale: 2048.0 | grad norm: 6.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.824 | TFLOPs: 42.01 | +[default7]: iteration 1368/ 6200 | consumed samples: 1400832 | consumed tokens: 2868903936 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.938943E+00 | loss scale: 2048.0 | grad norm: 8.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.826 | TFLOPs: 42.01 | +[default7]: iteration 1369/ 6200 | consumed samples: 1401856 | consumed tokens: 2871001088 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.916247E+00 | loss scale: 2048.0 | grad norm: 6.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.798 | TFLOPs: 42.00 | +[default7]: iteration 1370/ 6200 | consumed samples: 1402880 | consumed tokens: 2873098240 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.925663E+00 | loss scale: 2048.0 | grad norm: 6.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.159 | TFLOPs: 42.11 | +[default7]: iteration 1371/ 6200 | consumed samples: 1403904 | consumed tokens: 2875195392 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.939959E+00 | loss scale: 2048.0 | grad norm: 6.072 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.962 | TFLOPs: 42.05 | +[default7]: iteration 1372/ 6200 | consumed samples: 1404928 | consumed tokens: 2877292544 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.953471E+00 | loss scale: 2048.0 | grad norm: 6.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.735 | TFLOPs: 41.98 | +[default7]: iteration 1373/ 6200 | consumed samples: 1405952 | consumed tokens: 2879389696 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.948099E+00 | loss scale: 2048.0 | grad norm: 6.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.954 | TFLOPs: 42.05 | +[default7]: iteration 1374/ 6200 | consumed samples: 1406976 | consumed tokens: 2881486848 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.925328E+00 | loss scale: 2048.0 | grad norm: 6.926 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.744 | TFLOPs: 41.99 | +[default7]: iteration 1375/ 6200 | consumed samples: 1408000 | consumed tokens: 2883584000 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.948892E+00 | loss scale: 2048.0 | grad norm: 5.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.198 | TFLOPs: 42.12 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 1375 | lm loss value: 3.449607E+00 | lm loss PPL: 3.148800E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 1375 | lm loss value: 1.826234E+00 | lm loss PPL: 6.210457E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 1376/ 6200 | consumed samples: 1409024 | consumed tokens: 2885681152 | elapsed time per iteration (s): 51.80 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.972765E+00 | loss scale: 2048.0 | grad norm: 6.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.769 | TFLOPs: 6.03 | +[default7]: iteration 1377/ 6200 | consumed samples: 1410048 | consumed tokens: 2887778304 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.921649E+00 | loss scale: 2048.0 | grad norm: 6.068 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.598 | TFLOPs: 41.94 | +[default7]: iteration 1378/ 6200 | consumed samples: 1411072 | consumed tokens: 2889875456 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.976181E+00 | loss scale: 2048.0 | grad norm: 6.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.276 | TFLOPs: 42.15 | +[default7]: iteration 1379/ 6200 | consumed samples: 1412096 | consumed tokens: 2891972608 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.969414E+00 | loss scale: 2048.0 | grad norm: 6.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.674 | TFLOPs: 41.96 | +[default7]: iteration 1380/ 6200 | consumed samples: 1413120 | consumed tokens: 2894069760 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.950297E+00 | loss scale: 2048.0 | grad norm: 5.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.823 | TFLOPs: 42.01 | +[default7]: iteration 1381/ 6200 | consumed samples: 1414144 | consumed tokens: 2896166912 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.946981E+00 | loss scale: 2048.0 | grad norm: 6.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.721 | TFLOPs: 41.98 | +[default7]: iteration 1382/ 6200 | consumed samples: 1415168 | consumed tokens: 2898264064 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.921640E+00 | loss scale: 2048.0 | grad norm: 6.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.222 | TFLOPs: 42.13 | +[default7]: iteration 1383/ 6200 | consumed samples: 1416192 | consumed tokens: 2900361216 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.946157E+00 | loss scale: 2048.0 | grad norm: 6.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.986 | TFLOPs: 42.06 | +[default7]: iteration 1384/ 6200 | consumed samples: 1417216 | consumed tokens: 2902458368 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.944346E+00 | loss scale: 2048.0 | grad norm: 6.723 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.660 | TFLOPs: 41.96 | +[default7]: iteration 1385/ 6200 | consumed samples: 1418240 | consumed tokens: 2904555520 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.928401E+00 | loss scale: 2048.0 | grad norm: 6.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.560 | TFLOPs: 41.93 | +[default7]: iteration 1386/ 6200 | consumed samples: 1419264 | consumed tokens: 2906652672 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.956260E+00 | loss scale: 2048.0 | grad norm: 7.664 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.522 | TFLOPs: 41.92 | +[default7]: iteration 1387/ 6200 | consumed samples: 1420288 | consumed tokens: 2908749824 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.920675E+00 | loss scale: 2048.0 | grad norm: 8.706 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.754 | TFLOPs: 41.99 | +[default7]: iteration 1388/ 6200 | consumed samples: 1421312 | consumed tokens: 2910846976 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.936044E+00 | loss scale: 2048.0 | grad norm: 7.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.148 | TFLOPs: 42.11 | +[default7]: iteration 1389/ 6200 | consumed samples: 1422336 | consumed tokens: 2912944128 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.947566E+00 | loss scale: 2048.0 | grad norm: 7.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.540 | TFLOPs: 41.92 | +[default7]: iteration 1390/ 6200 | consumed samples: 1423360 | consumed tokens: 2915041280 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.955960E+00 | loss scale: 2048.0 | grad norm: 7.943 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.384 | TFLOPs: 41.88 | +[default7]: iteration 1391/ 6200 | consumed samples: 1424384 | consumed tokens: 2917138432 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.916053E+00 | loss scale: 2048.0 | grad norm: 5.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.573 | TFLOPs: 41.93 | +[default7]: iteration 1392/ 6200 | consumed samples: 1425408 | consumed tokens: 2919235584 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.913339E+00 | loss scale: 2048.0 | grad norm: 6.024 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.682 | TFLOPs: 41.97 | +[default7]: iteration 1393/ 6200 | consumed samples: 1426432 | consumed tokens: 2921332736 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.938436E+00 | loss scale: 2048.0 | grad norm: 7.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.741 | TFLOPs: 41.98 | +[default7]: iteration 1394/ 6200 | consumed samples: 1427456 | consumed tokens: 2923429888 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.918481E+00 | loss scale: 2048.0 | grad norm: 5.795 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.620 | TFLOPs: 41.95 | +[default7]: iteration 1395/ 6200 | consumed samples: 1428480 | consumed tokens: 2925527040 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.922132E+00 | loss scale: 2048.0 | grad norm: 6.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.278 | TFLOPs: 41.84 | +[default7]: iteration 1396/ 6200 | consumed samples: 1429504 | consumed tokens: 2927624192 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.923794E+00 | loss scale: 2048.0 | grad norm: 5.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.438 | TFLOPs: 41.89 | +[default7]: iteration 1397/ 6200 | consumed samples: 1430528 | consumed tokens: 2929721344 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.912315E+00 | loss scale: 2048.0 | grad norm: 7.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.397 | TFLOPs: 41.88 | +[default7]: iteration 1398/ 6200 | consumed samples: 1431552 | consumed tokens: 2931818496 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.945730E+00 | loss scale: 2048.0 | grad norm: 5.972 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.942 | TFLOPs: 42.05 | +[default7]: iteration 1399/ 6200 | consumed samples: 1432576 | consumed tokens: 2933915648 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.917131E+00 | loss scale: 2048.0 | grad norm: 6.026 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.532 | TFLOPs: 41.92 | +[default7]: iteration 1400/ 6200 | consumed samples: 1433600 | consumed tokens: 2936012800 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.925578E+00 | loss scale: 2048.0 | grad norm: 6.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.702 | TFLOPs: 41.97 | +[default7]: iteration 1401/ 6200 | consumed samples: 1434624 | consumed tokens: 2938109952 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.923815E+00 | loss scale: 2048.0 | grad norm: 5.711 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.565 | TFLOPs: 41.93 | +[default7]: iteration 1402/ 6200 | consumed samples: 1435648 | consumed tokens: 2940207104 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.915472E+00 | loss scale: 2048.0 | grad norm: 6.603 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.658 | TFLOPs: 41.96 | +[default7]: iteration 1403/ 6200 | consumed samples: 1436672 | consumed tokens: 2942304256 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.932923E+00 | loss scale: 2048.0 | grad norm: 7.064 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.784 | TFLOPs: 41.69 | +[default7]: iteration 1404/ 6200 | consumed samples: 1437696 | consumed tokens: 2944401408 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.942353E+00 | loss scale: 2048.0 | grad norm: 5.994 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.430 | TFLOPs: 41.89 | +[default7]: iteration 1405/ 6200 | consumed samples: 1438720 | consumed tokens: 2946498560 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.930112E+00 | loss scale: 2048.0 | grad norm: 5.600 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.533 | TFLOPs: 41.92 | +[default7]: iteration 1406/ 6200 | consumed samples: 1439744 | consumed tokens: 2948595712 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.945220E+00 | loss scale: 2048.0 | grad norm: 5.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.538 | TFLOPs: 41.92 | +[default7]: iteration 1407/ 6200 | consumed samples: 1440768 | consumed tokens: 2950692864 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.933237E+00 | loss scale: 2048.0 | grad norm: 5.902 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.663 | TFLOPs: 41.96 | +[default7]: iteration 1408/ 6200 | consumed samples: 1441792 | consumed tokens: 2952790016 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.953076E+00 | loss scale: 2048.0 | grad norm: 5.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.383 | TFLOPs: 41.88 | +[default7]: iteration 1409/ 6200 | consumed samples: 1442816 | consumed tokens: 2954887168 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.914373E+00 | loss scale: 2048.0 | grad norm: 7.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.651 | TFLOPs: 41.96 | +[default7]: iteration 1410/ 6200 | consumed samples: 1443840 | consumed tokens: 2956984320 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.959160E+00 | loss scale: 2048.0 | grad norm: 5.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.991 | TFLOPs: 42.06 | +[default7]: iteration 1411/ 6200 | consumed samples: 1444864 | consumed tokens: 2959081472 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.913350E+00 | loss scale: 2048.0 | grad norm: 5.836 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.564 | TFLOPs: 41.93 | +[default7]: iteration 1412/ 6200 | consumed samples: 1445888 | consumed tokens: 2961178624 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.923085E+00 | loss scale: 2048.0 | grad norm: 6.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.703 | TFLOPs: 41.97 | +[default7]: iteration 1413/ 6200 | consumed samples: 1446912 | consumed tokens: 2963275776 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.940950E+00 | loss scale: 2048.0 | grad norm: 6.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.015 | TFLOPs: 42.07 | +[default7]: iteration 1414/ 6200 | consumed samples: 1447936 | consumed tokens: 2965372928 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.981035E+00 | loss scale: 2048.0 | grad norm: 8.067 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.213 | TFLOPs: 41.82 | +[default7]: iteration 1415/ 6200 | consumed samples: 1448960 | consumed tokens: 2967470080 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.930977E+00 | loss scale: 2048.0 | grad norm: 6.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.285 | TFLOPs: 41.85 | +[default7]: iteration 1416/ 6200 | consumed samples: 1449984 | consumed tokens: 2969567232 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.972878E+00 | loss scale: 2048.0 | grad norm: 6.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.564 | TFLOPs: 41.93 | +[default7]: iteration 1417/ 6200 | consumed samples: 1451008 | consumed tokens: 2971664384 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.917847E+00 | loss scale: 2048.0 | grad norm: 6.661 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.206 | TFLOPs: 41.82 | +[default7]: iteration 1418/ 6200 | consumed samples: 1452032 | consumed tokens: 2973761536 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.941798E+00 | loss scale: 2048.0 | grad norm: 5.620 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.462 | TFLOPs: 41.90 | +[default7]: iteration 1419/ 6200 | consumed samples: 1453056 | consumed tokens: 2975858688 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.906683E+00 | loss scale: 2048.0 | grad norm: 7.021 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.391 | TFLOPs: 41.88 | +[default7]: iteration 1420/ 6200 | consumed samples: 1454080 | consumed tokens: 2977955840 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.944927E+00 | loss scale: 2048.0 | grad norm: 7.758 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.950 | TFLOPs: 42.05 | +[default7]: iteration 1421/ 6200 | consumed samples: 1455104 | consumed tokens: 2980052992 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.939914E+00 | loss scale: 2048.0 | grad norm: 6.918 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.003 | TFLOPs: 42.06 | +[default7]: iteration 1422/ 6200 | consumed samples: 1456128 | consumed tokens: 2982150144 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.954472E+00 | loss scale: 2048.0 | grad norm: 8.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.009 | TFLOPs: 42.07 | +[default7]: iteration 1423/ 6200 | consumed samples: 1457152 | consumed tokens: 2984247296 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.901567E+00 | loss scale: 2048.0 | grad norm: 6.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.929 | TFLOPs: 42.04 | +[default7]: iteration 1424/ 6200 | consumed samples: 1458176 | consumed tokens: 2986344448 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.940004E+00 | loss scale: 2048.0 | grad norm: 5.765 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.338 | TFLOPs: 42.17 | +[default7]: iteration 1425/ 6200 | consumed samples: 1459200 | consumed tokens: 2988441600 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.918560E+00 | loss scale: 2048.0 | grad norm: 7.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.888 | TFLOPs: 42.03 | +[default7]: iteration 1426/ 6200 | consumed samples: 1460224 | consumed tokens: 2990538752 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.916537E+00 | loss scale: 2048.0 | grad norm: 6.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.606 | TFLOPs: 42.25 | +[default7]: iteration 1427/ 6200 | consumed samples: 1461248 | consumed tokens: 2992635904 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.958223E+00 | loss scale: 2048.0 | grad norm: 8.985 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.070 | TFLOPs: 42.09 | +[default7]: iteration 1428/ 6200 | consumed samples: 1462272 | consumed tokens: 2994733056 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.957937E+00 | loss scale: 2048.0 | grad norm: 9.026 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.373 | TFLOPs: 42.18 | +[default7]: iteration 1429/ 6200 | consumed samples: 1463296 | consumed tokens: 2996830208 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.931823E+00 | loss scale: 2048.0 | grad norm: 7.990 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.759 | TFLOPs: 42.29 | +[default7]: iteration 1430/ 6200 | consumed samples: 1464320 | consumed tokens: 2998927360 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.911422E+00 | loss scale: 2048.0 | grad norm: 7.045 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.231 | TFLOPs: 42.13 | +[default7]: iteration 1431/ 6200 | consumed samples: 1465344 | consumed tokens: 3001024512 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.942891E+00 | loss scale: 2048.0 | grad norm: 7.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.202 | TFLOPs: 42.13 | +[default7]: iteration 1432/ 6200 | consumed samples: 1466368 | consumed tokens: 3003121664 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.923146E+00 | loss scale: 2048.0 | grad norm: 6.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.073 | TFLOPs: 42.09 | +[default7]: iteration 1433/ 6200 | consumed samples: 1467392 | consumed tokens: 3005218816 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.935852E+00 | loss scale: 2048.0 | grad norm: 5.574 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.450 | TFLOPs: 42.20 | +[default7]: iteration 1434/ 6200 | consumed samples: 1468416 | consumed tokens: 3007315968 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.906603E+00 | loss scale: 2048.0 | grad norm: 6.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.394 | TFLOPs: 42.18 | +[default7]: iteration 1435/ 6200 | consumed samples: 1469440 | consumed tokens: 3009413120 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.911771E+00 | loss scale: 2048.0 | grad norm: 5.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.701 | TFLOPs: 42.28 | +[default7]: iteration 1436/ 6200 | consumed samples: 1470464 | consumed tokens: 3011510272 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.924619E+00 | loss scale: 2048.0 | grad norm: 6.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.544 | TFLOPs: 42.23 | +[default7]: iteration 1437/ 6200 | consumed samples: 1471488 | consumed tokens: 3013607424 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.923281E+00 | loss scale: 2048.0 | grad norm: 5.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.566 | TFLOPs: 42.24 | +[default7]: iteration 1438/ 6200 | consumed samples: 1472512 | consumed tokens: 3015704576 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.911406E+00 | loss scale: 2048.0 | grad norm: 7.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.494 | TFLOPs: 42.21 | +[default7]: iteration 1439/ 6200 | consumed samples: 1473536 | consumed tokens: 3017801728 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.939498E+00 | loss scale: 2048.0 | grad norm: 6.670 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.291 | TFLOPs: 42.15 | +[default7]: iteration 1440/ 6200 | consumed samples: 1474560 | consumed tokens: 3019898880 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.933281E+00 | loss scale: 2048.0 | grad norm: 10.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.398 | TFLOPs: 42.18 | +[default7]: iteration 1441/ 6200 | consumed samples: 1475584 | consumed tokens: 3021996032 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.939988E+00 | loss scale: 2048.0 | grad norm: 6.795 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.088 | TFLOPs: 42.09 | +[default7]: iteration 1442/ 6200 | consumed samples: 1476608 | consumed tokens: 3024093184 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.912603E+00 | loss scale: 2048.0 | grad norm: 5.929 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.220 | TFLOPs: 42.13 | +[default7]: iteration 1443/ 6200 | consumed samples: 1477632 | consumed tokens: 3026190336 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.936392E+00 | loss scale: 2048.0 | grad norm: 6.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.263 | TFLOPs: 42.14 | +[default7]: iteration 1444/ 6200 | consumed samples: 1478656 | consumed tokens: 3028287488 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.937752E+00 | loss scale: 2048.0 | grad norm: 5.043 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.856 | TFLOPs: 42.02 | +[default7]: iteration 1445/ 6200 | consumed samples: 1479680 | consumed tokens: 3030384640 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.903406E+00 | loss scale: 2048.0 | grad norm: 5.822 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.729 | TFLOPs: 41.98 | +[default7]: iteration 1446/ 6200 | consumed samples: 1480704 | consumed tokens: 3032481792 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.909257E+00 | loss scale: 2048.0 | grad norm: 8.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.833 | TFLOPs: 42.01 | +[default7]: iteration 1447/ 6200 | consumed samples: 1481728 | consumed tokens: 3034578944 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.922392E+00 | loss scale: 2048.0 | grad norm: 7.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.165 | TFLOPs: 42.11 | +[default7]: iteration 1448/ 6200 | consumed samples: 1482752 | consumed tokens: 3036676096 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.930640E+00 | loss scale: 2048.0 | grad norm: 5.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.878 | TFLOPs: 42.03 | +[default7]: iteration 1449/ 6200 | consumed samples: 1483776 | consumed tokens: 3038773248 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.937721E+00 | loss scale: 2048.0 | grad norm: 6.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.962 | TFLOPs: 42.05 | +[default7]: iteration 1450/ 6200 | consumed samples: 1484800 | consumed tokens: 3040870400 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.939459E+00 | loss scale: 2048.0 | grad norm: 7.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.316 | TFLOPs: 42.16 | +[default7]: iteration 1451/ 6200 | consumed samples: 1485824 | consumed tokens: 3042967552 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.919389E+00 | loss scale: 2048.0 | grad norm: 5.995 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.384 | TFLOPs: 42.18 | +[default7]: iteration 1452/ 6200 | consumed samples: 1486848 | consumed tokens: 3045064704 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.898970E+00 | loss scale: 2048.0 | grad norm: 6.029 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.053 | TFLOPs: 42.08 | +[default7]: iteration 1453/ 6200 | consumed samples: 1487872 | consumed tokens: 3047161856 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.940413E+00 | loss scale: 2048.0 | grad norm: 7.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.318 | TFLOPs: 42.16 | +[default7]: iteration 1454/ 6200 | consumed samples: 1488896 | consumed tokens: 3049259008 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.951347E+00 | loss scale: 2048.0 | grad norm: 7.917 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.036 | TFLOPs: 42.07 | +[default7]: iteration 1455/ 6200 | consumed samples: 1489920 | consumed tokens: 3051356160 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.911857E+00 | loss scale: 2048.0 | grad norm: 5.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.994 | TFLOPs: 42.06 | +[default7]: iteration 1456/ 6200 | consumed samples: 1490944 | consumed tokens: 3053453312 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.903968E+00 | loss scale: 2048.0 | grad norm: 6.710 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.928 | TFLOPs: 42.04 | +[default7]: iteration 1457/ 6200 | consumed samples: 1491968 | consumed tokens: 3055550464 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.913381E+00 | loss scale: 2048.0 | grad norm: 5.782 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.980 | TFLOPs: 42.06 | +[default7]: iteration 1458/ 6200 | consumed samples: 1492992 | consumed tokens: 3057647616 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.906199E+00 | loss scale: 2048.0 | grad norm: 6.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.270 | TFLOPs: 42.15 | +[default7]: iteration 1459/ 6200 | consumed samples: 1494016 | consumed tokens: 3059744768 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.917654E+00 | loss scale: 2048.0 | grad norm: 5.041 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.215 | TFLOPs: 42.13 | +[default7]: iteration 1460/ 6200 | consumed samples: 1495040 | consumed tokens: 3061841920 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.942428E+00 | loss scale: 2048.0 | grad norm: 6.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.176 | TFLOPs: 42.12 | +[default7]: iteration 1461/ 6200 | consumed samples: 1496064 | consumed tokens: 3063939072 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.919243E+00 | loss scale: 2048.0 | grad norm: 6.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.961 | TFLOPs: 42.05 | +[default7]: iteration 1462/ 6200 | consumed samples: 1497088 | consumed tokens: 3066036224 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.939560E+00 | loss scale: 2048.0 | grad norm: 5.901 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.932 | TFLOPs: 42.04 | +[default7]: iteration 1463/ 6200 | consumed samples: 1498112 | consumed tokens: 3068133376 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.902257E+00 | loss scale: 2048.0 | grad norm: 5.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.946 | TFLOPs: 42.05 | +[default7]: iteration 1464/ 6200 | consumed samples: 1499136 | consumed tokens: 3070230528 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.926360E+00 | loss scale: 2048.0 | grad norm: 6.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.073 | TFLOPs: 42.09 | +[default7]: iteration 1465/ 6200 | consumed samples: 1500160 | consumed tokens: 3072327680 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.928545E+00 | loss scale: 2048.0 | grad norm: 6.749 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.987 | TFLOPs: 42.06 | +[default7]: iteration 1466/ 6200 | consumed samples: 1501184 | consumed tokens: 3074424832 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.953911E+00 | loss scale: 2048.0 | grad norm: 5.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.028 | TFLOPs: 42.07 | +[default7]: iteration 1467/ 6200 | consumed samples: 1502208 | consumed tokens: 3076521984 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.945162E+00 | loss scale: 2048.0 | grad norm: 5.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.858 | TFLOPs: 42.02 | +[default7]: iteration 1468/ 6200 | consumed samples: 1503232 | consumed tokens: 3078619136 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.917181E+00 | loss scale: 2048.0 | grad norm: 6.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.011 | TFLOPs: 42.07 | +[default7]: iteration 1469/ 6200 | consumed samples: 1504256 | consumed tokens: 3080716288 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.882492E+00 | loss scale: 2048.0 | grad norm: 6.019 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.846 | TFLOPs: 42.02 | +[default7]: iteration 1470/ 6200 | consumed samples: 1505280 | consumed tokens: 3082813440 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.917479E+00 | loss scale: 2048.0 | grad norm: 5.660 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.750 | TFLOPs: 41.99 | +[default7]: iteration 1471/ 6200 | consumed samples: 1506304 | consumed tokens: 3084910592 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.926656E+00 | loss scale: 2048.0 | grad norm: 5.593 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.077 | TFLOPs: 42.09 | +[default7]: iteration 1472/ 6200 | consumed samples: 1507328 | consumed tokens: 3087007744 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.905558E+00 | loss scale: 2048.0 | grad norm: 5.742 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.094 | TFLOPs: 42.09 | +[default7]: iteration 1473/ 6200 | consumed samples: 1508352 | consumed tokens: 3089104896 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.916630E+00 | loss scale: 2048.0 | grad norm: 5.639 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.889 | TFLOPs: 42.03 | +[default7]: iteration 1474/ 6200 | consumed samples: 1509376 | consumed tokens: 3091202048 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.914846E+00 | loss scale: 2048.0 | grad norm: 5.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.574 | TFLOPs: 41.93 | +[default7]: iteration 1475/ 6200 | consumed samples: 1510400 | consumed tokens: 3093299200 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.934544E+00 | loss scale: 2048.0 | grad norm: 5.762 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.173 | TFLOPs: 42.12 | +[default7]: iteration 1476/ 6200 | consumed samples: 1511424 | consumed tokens: 3095396352 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.906749E+00 | loss scale: 2048.0 | grad norm: 5.926 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.892 | TFLOPs: 42.03 | +[default7]: iteration 1477/ 6200 | consumed samples: 1512448 | consumed tokens: 3097493504 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.955363E+00 | loss scale: 2048.0 | grad norm: 7.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.859 | TFLOPs: 42.02 | +[default7]: iteration 1478/ 6200 | consumed samples: 1513472 | consumed tokens: 3099590656 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.915489E+00 | loss scale: 2048.0 | grad norm: 5.971 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.132 | TFLOPs: 42.10 | +[default7]: iteration 1479/ 6200 | consumed samples: 1514496 | consumed tokens: 3101687808 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.919478E+00 | loss scale: 2048.0 | grad norm: 5.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.932 | TFLOPs: 42.04 | +[default7]: iteration 1480/ 6200 | consumed samples: 1515520 | consumed tokens: 3103784960 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.888863E+00 | loss scale: 2048.0 | grad norm: 7.657 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.809 | TFLOPs: 42.01 | +[default7]: iteration 1481/ 6200 | consumed samples: 1516544 | consumed tokens: 3105882112 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.925940E+00 | loss scale: 2048.0 | grad norm: 5.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.826 | TFLOPs: 42.01 | +[default7]: iteration 1482/ 6200 | consumed samples: 1517568 | consumed tokens: 3107979264 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.890243E+00 | loss scale: 2048.0 | grad norm: 6.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.577 | TFLOPs: 41.93 | +[default7]: iteration 1483/ 6200 | consumed samples: 1518592 | consumed tokens: 3110076416 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.933514E+00 | loss scale: 2048.0 | grad norm: 5.838 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.887 | TFLOPs: 42.03 | +[default7]: iteration 1484/ 6200 | consumed samples: 1519616 | consumed tokens: 3112173568 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.905887E+00 | loss scale: 2048.0 | grad norm: 5.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.930 | TFLOPs: 42.04 | +[default7]: iteration 1485/ 6200 | consumed samples: 1520640 | consumed tokens: 3114270720 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.945771E+00 | loss scale: 2048.0 | grad norm: 5.848 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.955 | TFLOPs: 42.05 | +[default7]: iteration 1486/ 6200 | consumed samples: 1521664 | consumed tokens: 3116367872 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.917038E+00 | loss scale: 2048.0 | grad norm: 7.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.272 | TFLOPs: 42.15 | +[default7]: iteration 1487/ 6200 | consumed samples: 1522688 | consumed tokens: 3118465024 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.928584E+00 | loss scale: 2048.0 | grad norm: 6.664 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.251 | TFLOPs: 42.14 | +[default7]: iteration 1488/ 6200 | consumed samples: 1523712 | consumed tokens: 3120562176 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.918161E+00 | loss scale: 2048.0 | grad norm: 5.996 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.992 | TFLOPs: 42.06 | +[default7]: iteration 1489/ 6200 | consumed samples: 1524736 | consumed tokens: 3122659328 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.902709E+00 | loss scale: 2048.0 | grad norm: 6.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.886 | TFLOPs: 42.03 | +[default7]: iteration 1490/ 6200 | consumed samples: 1525760 | consumed tokens: 3124756480 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.904550E+00 | loss scale: 2048.0 | grad norm: 7.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.293 | TFLOPs: 42.15 | +[default7]: iteration 1491/ 6200 | consumed samples: 1526784 | consumed tokens: 3126853632 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.930355E+00 | loss scale: 2048.0 | grad norm: 6.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.183 | TFLOPs: 42.12 | +[default7]: iteration 1492/ 6200 | consumed samples: 1527808 | consumed tokens: 3128950784 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.918068E+00 | loss scale: 2048.0 | grad norm: 7.054 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.344 | TFLOPs: 42.17 | +[default7]: iteration 1493/ 6200 | consumed samples: 1528832 | consumed tokens: 3131047936 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.913581E+00 | loss scale: 2048.0 | grad norm: 7.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.227 | TFLOPs: 42.13 | +[default7]: iteration 1494/ 6200 | consumed samples: 1529856 | consumed tokens: 3133145088 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.891239E+00 | loss scale: 2048.0 | grad norm: 6.664 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.387 | TFLOPs: 41.88 | +[default7]: iteration 1495/ 6200 | consumed samples: 1530880 | consumed tokens: 3135242240 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.916825E+00 | loss scale: 2048.0 | grad norm: 6.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.292 | TFLOPs: 41.85 | +[default7]: iteration 1496/ 6200 | consumed samples: 1531904 | consumed tokens: 3137339392 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.894580E+00 | loss scale: 2048.0 | grad norm: 11.012 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.283 | TFLOPs: 41.84 | +[default7]: iteration 1497/ 6200 | consumed samples: 1532928 | consumed tokens: 3139436544 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.916291E+00 | loss scale: 2048.0 | grad norm: 6.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.844 | TFLOPs: 42.02 | +[default7]: iteration 1498/ 6200 | consumed samples: 1533952 | consumed tokens: 3141533696 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.899408E+00 | loss scale: 2048.0 | grad norm: 5.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.594 | TFLOPs: 41.94 | +[default7]: iteration 1499/ 6200 | consumed samples: 1534976 | consumed tokens: 3143630848 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.927772E+00 | loss scale: 2048.0 | grad norm: 7.620 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.765 | TFLOPs: 41.99 | +[default7]: iteration 1500/ 6200 | consumed samples: 1536000 | consumed tokens: 3145728000 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.919995E+00 | loss scale: 2048.0 | grad norm: 7.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.544 | TFLOPs: 42.23 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 1500 | lm loss value: 3.452170E+00 | lm loss PPL: 3.156882E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 1500 | lm loss value: 1.811947E+00 | lm loss PPL: 6.122359E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 1500 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 15:57:09,329] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1500 is begin to save! +[default0]:[2022-10-06 15:57:09,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:09,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:09,740] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:09,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:09,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:09,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:09,797] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:09,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:09,824] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:09,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:09,852] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:09,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:09,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:09,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:09,918] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:09,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:09,946] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:09,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:09,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,030] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,057] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,085] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,113] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,169] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,198] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,227] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,254] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,282] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,341] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,368] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 15:57:10,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,422] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 15:57:10,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 15:57:10,424] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/mp_rank_00_model_states.pt +[default0]:[2022-10-06 15:57:10,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 15:57:10,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 15:57:10,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 15:57:10,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 15:57:10,624] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 15:57:10,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 15:57:10,646] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 15:57:10,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 15:57:10,657] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 15:57:10,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 15:57:10,695] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 15:57:10,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 15:57:10,631] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 15:57:10,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 15:57:10,645] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 15:57:10,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 15:57:10,694] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 15:57:10,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 15:57:10,720] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 15:57:10,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 15:57:10,689] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 15:57:10,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 15:57:10,751] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 15:57:10,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 15:57:10,748] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 15:57:10,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 15:57:10,743] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 15:57:10,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 15:57:10,747] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 15:57:10,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 15:57:10,763] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 15:57:10,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 15:57:10,715] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 15:57:10,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 15:57:10,748] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 15:57:10,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 15:57:10,709] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 15:57:10,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 15:57:10,706] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 15:57:10,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 15:57:10,748] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 15:57:10,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 15:57:10,746] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 15:57:10,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 15:57:10,714] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 15:57:10,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 15:57:10,729] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 15:57:10,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 15:57:10,799] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 15:57:10,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 15:57:10,748] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 15:57:10,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 15:57:10,734] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 15:57:10,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 15:57:10,749] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 15:57:10,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 15:57:10,766] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 15:57:10,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 15:57:10,821] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 15:57:10,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 15:57:10,795] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 15:57:10,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 15:57:10,844] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default6]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default3]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default2]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default4]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default6]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default2]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default2]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default7]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default0]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default1]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default3]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default7]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default4]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default5]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default5]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default0]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default1]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default1]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default4]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default4]:[2022-10-06 15:57:10,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 15:57:10,918] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default3]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default7]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default7]:time (ms) | save-checkpoint: 1589.83 +[default7]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default0]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default6]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default5]:[2022-10-06 15:57:10,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 15:57:10,913] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1500/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default5]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default6]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default3]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default1]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default0]:[2022-10-06 15:57:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1500 is ready now! +[default0]: successfully saved checkpoint at iteration 1500 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default7]: iteration 1501/ 6200 | consumed samples: 1537024 | consumed tokens: 3147825152 | elapsed time per iteration (s): 53.62 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.931046E+00 | loss scale: 2048.0 | grad norm: 6.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.097 | TFLOPs: 5.82 | +[default7]: iteration 1502/ 6200 | consumed samples: 1538048 | consumed tokens: 3149922304 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.913213E+00 | loss scale: 2048.0 | grad norm: 6.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.664 | TFLOPs: 41.96 | +[default7]: iteration 1503/ 6200 | consumed samples: 1539072 | consumed tokens: 3152019456 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.903641E+00 | loss scale: 2048.0 | grad norm: 7.843 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.178 | TFLOPs: 42.12 | +[default7]: iteration 1504/ 6200 | consumed samples: 1540096 | consumed tokens: 3154116608 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.910118E+00 | loss scale: 4096.0 | grad norm: 5.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.160 | TFLOPs: 42.11 | +[default7]: iteration 1505/ 6200 | consumed samples: 1541120 | consumed tokens: 3156213760 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.888629E+00 | loss scale: 4096.0 | grad norm: 5.929 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.062 | TFLOPs: 42.08 | +[default7]: iteration 1506/ 6200 | consumed samples: 1542144 | consumed tokens: 3158310912 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.934883E+00 | loss scale: 4096.0 | grad norm: 5.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.303 | TFLOPs: 42.16 | +[default7]: iteration 1507/ 6200 | consumed samples: 1543168 | consumed tokens: 3160408064 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.902532E+00 | loss scale: 4096.0 | grad norm: 5.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.630 | TFLOPs: 41.95 | +[default7]: iteration 1508/ 6200 | consumed samples: 1544192 | consumed tokens: 3162505216 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.908789E+00 | loss scale: 4096.0 | grad norm: 6.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.460 | TFLOPs: 42.20 | +[default7]: iteration 1509/ 6200 | consumed samples: 1545216 | consumed tokens: 3164602368 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.920485E+00 | loss scale: 4096.0 | grad norm: 5.777 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.448 | TFLOPs: 42.20 | +[default7]: iteration 1510/ 6200 | consumed samples: 1546240 | consumed tokens: 3166699520 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.917054E+00 | loss scale: 4096.0 | grad norm: 6.865 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.859 | TFLOPs: 42.02 | +[default7]: iteration 1511/ 6200 | consumed samples: 1547264 | consumed tokens: 3168796672 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.944185E+00 | loss scale: 4096.0 | grad norm: 5.639 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.415 | TFLOPs: 42.19 | +[default7]: iteration 1512/ 6200 | consumed samples: 1548288 | consumed tokens: 3170893824 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.924608E+00 | loss scale: 4096.0 | grad norm: 6.014 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.211 | TFLOPs: 42.13 | +[default7]: iteration 1513/ 6200 | consumed samples: 1549312 | consumed tokens: 3172990976 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.929118E+00 | loss scale: 4096.0 | grad norm: 4.897 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.356 | TFLOPs: 42.17 | +[default7]: iteration 1514/ 6200 | consumed samples: 1550336 | consumed tokens: 3175088128 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.917935E+00 | loss scale: 4096.0 | grad norm: 5.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.166 | TFLOPs: 42.11 | +[default7]: iteration 1515/ 6200 | consumed samples: 1551360 | consumed tokens: 3177185280 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.904751E+00 | loss scale: 4096.0 | grad norm: 6.823 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.169 | TFLOPs: 42.12 | +[default7]: iteration 1516/ 6200 | consumed samples: 1552384 | consumed tokens: 3179282432 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.906054E+00 | loss scale: 4096.0 | grad norm: 6.756 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.154 | TFLOPs: 42.11 | +[default7]: iteration 1517/ 6200 | consumed samples: 1553408 | consumed tokens: 3181379584 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.936401E+00 | loss scale: 4096.0 | grad norm: 5.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.402 | TFLOPs: 42.19 | +[default7]: iteration 1518/ 6200 | consumed samples: 1554432 | consumed tokens: 3183476736 | elapsed time per iteration (s): 7.27 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.899449E+00 | loss scale: 4096.0 | grad norm: 5.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 140.840 | TFLOPs: 42.93 | +[default0]:[2022-10-06 15:59:24,194] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096.0, reducing to 4096.0 +[default7]: iteration 1519/ 6200 | consumed samples: 1555456 | consumed tokens: 3185573888 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.912560E+00 | loss scale: 4096.0 | grad norm: 5.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.492 | TFLOPs: 42.21 | +[default7]: iteration 1520/ 6200 | consumed samples: 1556480 | consumed tokens: 3187671040 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.906007E+00 | loss scale: 4096.0 | grad norm: 5.744 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.365 | TFLOPs: 42.17 | +[default7]: iteration 1521/ 6200 | consumed samples: 1557504 | consumed tokens: 3189768192 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.910268E+00 | loss scale: 4096.0 | grad norm: 5.725 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.144 | TFLOPs: 42.11 | +[default7]: iteration 1522/ 6200 | consumed samples: 1558528 | consumed tokens: 3191865344 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.937264E+00 | loss scale: 4096.0 | grad norm: 7.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.170 | TFLOPs: 42.12 | +[default7]: iteration 1523/ 6200 | consumed samples: 1559552 | consumed tokens: 3193962496 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.888309E+00 | loss scale: 4096.0 | grad norm: 6.071 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.115 | TFLOPs: 42.10 | +[default7]: iteration 1524/ 6200 | consumed samples: 1560576 | consumed tokens: 3196059648 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.910279E+00 | loss scale: 4096.0 | grad norm: 7.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.107 | TFLOPs: 42.10 | +[default7]: iteration 1525/ 6200 | consumed samples: 1561600 | consumed tokens: 3198156800 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.898914E+00 | loss scale: 4096.0 | grad norm: 6.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.151 | TFLOPs: 42.11 | +[default7]: iteration 1526/ 6200 | consumed samples: 1562624 | consumed tokens: 3200253952 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.905111E+00 | loss scale: 4096.0 | grad norm: 5.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.119 | TFLOPs: 42.10 | +[default7]: iteration 1527/ 6200 | consumed samples: 1563648 | consumed tokens: 3202351104 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.921360E+00 | loss scale: 4096.0 | grad norm: 5.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.897 | TFLOPs: 42.03 | +[default7]: iteration 1528/ 6200 | consumed samples: 1564672 | consumed tokens: 3204448256 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.931556E+00 | loss scale: 4096.0 | grad norm: 6.837 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.796 | TFLOPs: 42.00 | +[default7]: iteration 1529/ 6200 | consumed samples: 1565696 | consumed tokens: 3206545408 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.906005E+00 | loss scale: 4096.0 | grad norm: 5.697 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.193 | TFLOPs: 42.12 | +[default7]: iteration 1530/ 6200 | consumed samples: 1566720 | consumed tokens: 3208642560 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.911614E+00 | loss scale: 4096.0 | grad norm: 6.895 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.698 | TFLOPs: 41.97 | +[default7]: iteration 1531/ 6200 | consumed samples: 1567744 | consumed tokens: 3210739712 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.904641E+00 | loss scale: 4096.0 | grad norm: 5.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.041 | TFLOPs: 41.77 | +[default7]: iteration 1532/ 6200 | consumed samples: 1568768 | consumed tokens: 3212836864 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.904452E+00 | loss scale: 4096.0 | grad norm: 5.769 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.941 | TFLOPs: 42.05 | +[default7]: iteration 1533/ 6200 | consumed samples: 1569792 | consumed tokens: 3214934016 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.919875E+00 | loss scale: 4096.0 | grad norm: 5.649 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.873 | TFLOPs: 42.02 | +[default7]: iteration 1534/ 6200 | consumed samples: 1570816 | consumed tokens: 3217031168 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.902728E+00 | loss scale: 4096.0 | grad norm: 5.602 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.219 | TFLOPs: 42.13 | +[default7]: iteration 1535/ 6200 | consumed samples: 1571840 | consumed tokens: 3219128320 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.887543E+00 | loss scale: 4096.0 | grad norm: 5.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.109 | TFLOPs: 42.10 | +[default7]: iteration 1536/ 6200 | consumed samples: 1572864 | consumed tokens: 3221225472 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.908263E+00 | loss scale: 4096.0 | grad norm: 5.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.866 | TFLOPs: 42.02 | +[default7]: iteration 1537/ 6200 | consumed samples: 1573888 | consumed tokens: 3223322624 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.907731E+00 | loss scale: 4096.0 | grad norm: 5.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.129 | TFLOPs: 42.10 | +[default7]: iteration 1538/ 6200 | consumed samples: 1574912 | consumed tokens: 3225419776 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.913311E+00 | loss scale: 4096.0 | grad norm: 5.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.183 | TFLOPs: 42.12 | +[default7]: iteration 1539/ 6200 | consumed samples: 1575936 | consumed tokens: 3227516928 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.893548E+00 | loss scale: 4096.0 | grad norm: 6.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.883 | TFLOPs: 42.03 | +[default7]: iteration 1540/ 6200 | consumed samples: 1576960 | consumed tokens: 3229614080 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.892922E+00 | loss scale: 4096.0 | grad norm: 5.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.713 | TFLOPs: 41.98 | +[default7]: iteration 1541/ 6200 | consumed samples: 1577984 | consumed tokens: 3231711232 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.878855E+00 | loss scale: 4096.0 | grad norm: 6.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.185 | TFLOPs: 42.12 | +[default7]: iteration 1542/ 6200 | consumed samples: 1579008 | consumed tokens: 3233808384 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.917849E+00 | loss scale: 4096.0 | grad norm: 7.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.148 | TFLOPs: 42.11 | +[default7]: iteration 1543/ 6200 | consumed samples: 1580032 | consumed tokens: 3235905536 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.908920E+00 | loss scale: 4096.0 | grad norm: 5.761 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.412 | TFLOPs: 42.19 | +[default7]: iteration 1544/ 6200 | consumed samples: 1581056 | consumed tokens: 3238002688 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.929135E+00 | loss scale: 4096.0 | grad norm: 6.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.087 | TFLOPs: 42.09 | +[default7]: iteration 1545/ 6200 | consumed samples: 1582080 | consumed tokens: 3240099840 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.909609E+00 | loss scale: 4096.0 | grad norm: 5.937 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.949 | TFLOPs: 42.05 | +[default7]: iteration 1546/ 6200 | consumed samples: 1583104 | consumed tokens: 3242196992 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.870308E+00 | loss scale: 4096.0 | grad norm: 6.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.051 | TFLOPs: 42.08 | +[default7]: iteration 1547/ 6200 | consumed samples: 1584128 | consumed tokens: 3244294144 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.917608E+00 | loss scale: 4096.0 | grad norm: 5.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.414 | TFLOPs: 42.19 | +[default7]: iteration 1548/ 6200 | consumed samples: 1585152 | consumed tokens: 3246391296 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.918617E+00 | loss scale: 4096.0 | grad norm: 7.896 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.411 | TFLOPs: 42.19 | +[default7]: iteration 1549/ 6200 | consumed samples: 1586176 | consumed tokens: 3248488448 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895046E+00 | loss scale: 4096.0 | grad norm: 5.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.167 | TFLOPs: 42.11 | +[default7]: iteration 1550/ 6200 | consumed samples: 1587200 | consumed tokens: 3250585600 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.915115E+00 | loss scale: 4096.0 | grad norm: 5.827 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.372 | TFLOPs: 42.18 | +[default7]: iteration 1551/ 6200 | consumed samples: 1588224 | consumed tokens: 3252682752 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.892782E+00 | loss scale: 4096.0 | grad norm: 5.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.368 | TFLOPs: 42.18 | +[default7]: iteration 1552/ 6200 | consumed samples: 1589248 | consumed tokens: 3254779904 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.901508E+00 | loss scale: 4096.0 | grad norm: 5.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.497 | TFLOPs: 42.22 | +[default7]: iteration 1553/ 6200 | consumed samples: 1590272 | consumed tokens: 3256877056 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.914011E+00 | loss scale: 4096.0 | grad norm: 7.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.469 | TFLOPs: 42.21 | +[default7]: iteration 1554/ 6200 | consumed samples: 1591296 | consumed tokens: 3258974208 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.896027E+00 | loss scale: 4096.0 | grad norm: 7.705 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.381 | TFLOPs: 42.18 | +[default7]: iteration 1555/ 6200 | consumed samples: 1592320 | consumed tokens: 3261071360 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.942913E+00 | loss scale: 4096.0 | grad norm: 6.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.103 | TFLOPs: 42.09 | +[default7]: iteration 1556/ 6200 | consumed samples: 1593344 | consumed tokens: 3263168512 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.923681E+00 | loss scale: 4096.0 | grad norm: 5.743 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.336 | TFLOPs: 42.17 | +[default7]: iteration 1557/ 6200 | consumed samples: 1594368 | consumed tokens: 3265265664 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.921806E+00 | loss scale: 4096.0 | grad norm: 7.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.549 | TFLOPs: 42.23 | +[default7]: iteration 1558/ 6200 | consumed samples: 1595392 | consumed tokens: 3267362816 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.893256E+00 | loss scale: 4096.0 | grad norm: 6.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.181 | TFLOPs: 42.12 | +[default7]: iteration 1559/ 6200 | consumed samples: 1596416 | consumed tokens: 3269459968 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.921271E+00 | loss scale: 4096.0 | grad norm: 5.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.331 | TFLOPs: 42.16 | +[default7]: iteration 1560/ 6200 | consumed samples: 1597440 | consumed tokens: 3271557120 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.894161E+00 | loss scale: 4096.0 | grad norm: 8.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.352 | TFLOPs: 42.17 | +[default7]: iteration 1561/ 6200 | consumed samples: 1598464 | consumed tokens: 3273654272 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.915173E+00 | loss scale: 4096.0 | grad norm: 5.922 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.610 | TFLOPs: 42.25 | +[default7]: iteration 1562/ 6200 | consumed samples: 1599488 | consumed tokens: 3275751424 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.926522E+00 | loss scale: 4096.0 | grad norm: 5.926 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.062 | TFLOPs: 42.08 | +[default7]: iteration 1563/ 6200 | consumed samples: 1600512 | consumed tokens: 3277848576 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.901878E+00 | loss scale: 4096.0 | grad norm: 6.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.405 | TFLOPs: 42.19 | +[default7]: iteration 1564/ 6200 | consumed samples: 1601536 | consumed tokens: 3279945728 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.894349E+00 | loss scale: 4096.0 | grad norm: 5.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.389 | TFLOPs: 42.18 | +[default7]: iteration 1565/ 6200 | consumed samples: 1602560 | consumed tokens: 3282042880 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.914670E+00 | loss scale: 4096.0 | grad norm: 6.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.653 | TFLOPs: 42.26 | +[default7]: iteration 1566/ 6200 | consumed samples: 1603584 | consumed tokens: 3284140032 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.907196E+00 | loss scale: 4096.0 | grad norm: 7.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.440 | TFLOPs: 42.20 | +[default7]: iteration 1567/ 6200 | consumed samples: 1604608 | consumed tokens: 3286237184 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.922996E+00 | loss scale: 4096.0 | grad norm: 7.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.345 | TFLOPs: 42.17 | +[default7]: iteration 1568/ 6200 | consumed samples: 1605632 | consumed tokens: 3288334336 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.886066E+00 | loss scale: 4096.0 | grad norm: 5.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.297 | TFLOPs: 42.15 | +[default7]: iteration 1569/ 6200 | consumed samples: 1606656 | consumed tokens: 3290431488 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.908914E+00 | loss scale: 4096.0 | grad norm: 5.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.399 | TFLOPs: 42.19 | +[default7]: iteration 1570/ 6200 | consumed samples: 1607680 | consumed tokens: 3292528640 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.898998E+00 | loss scale: 4096.0 | grad norm: 5.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.316 | TFLOPs: 42.16 | +[default7]: iteration 1571/ 6200 | consumed samples: 1608704 | consumed tokens: 3294625792 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.913471E+00 | loss scale: 4096.0 | grad norm: 5.923 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.205 | TFLOPs: 42.13 | +[default7]: iteration 1572/ 6200 | consumed samples: 1609728 | consumed tokens: 3296722944 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.913658E+00 | loss scale: 4096.0 | grad norm: 5.967 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.317 | TFLOPs: 42.16 | +[default7]: iteration 1573/ 6200 | consumed samples: 1610752 | consumed tokens: 3298820096 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.932800E+00 | loss scale: 4096.0 | grad norm: 5.621 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.349 | TFLOPs: 42.17 | +[default7]: iteration 1574/ 6200 | consumed samples: 1611776 | consumed tokens: 3300917248 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.888298E+00 | loss scale: 4096.0 | grad norm: 6.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.402 | TFLOPs: 42.19 | +[default7]: iteration 1575/ 6200 | consumed samples: 1612800 | consumed tokens: 3303014400 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.906356E+00 | loss scale: 4096.0 | grad norm: 5.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.403 | TFLOPs: 42.19 | +[default7]: iteration 1576/ 6200 | consumed samples: 1613824 | consumed tokens: 3305111552 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.886764E+00 | loss scale: 4096.0 | grad norm: 6.879 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.169 | TFLOPs: 42.12 | +[default7]: iteration 1577/ 6200 | consumed samples: 1614848 | consumed tokens: 3307208704 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.888703E+00 | loss scale: 4096.0 | grad norm: 7.018 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.423 | TFLOPs: 42.19 | +[default7]: iteration 1578/ 6200 | consumed samples: 1615872 | consumed tokens: 3309305856 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.903654E+00 | loss scale: 4096.0 | grad norm: 6.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.613 | TFLOPs: 42.25 | +[default7]: iteration 1579/ 6200 | consumed samples: 1616896 | consumed tokens: 3311403008 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.892867E+00 | loss scale: 4096.0 | grad norm: 6.865 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.641 | TFLOPs: 42.26 | +[default7]: iteration 1580/ 6200 | consumed samples: 1617920 | consumed tokens: 3313500160 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.897588E+00 | loss scale: 4096.0 | grad norm: 7.048 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.416 | TFLOPs: 42.19 | +[default7]: iteration 1581/ 6200 | consumed samples: 1618944 | consumed tokens: 3315597312 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.889382E+00 | loss scale: 4096.0 | grad norm: 8.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.387 | TFLOPs: 42.18 | +[default7]: iteration 1582/ 6200 | consumed samples: 1619968 | consumed tokens: 3317694464 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.907590E+00 | loss scale: 4096.0 | grad norm: 7.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.008 | TFLOPs: 42.07 | +[default7]: iteration 1583/ 6200 | consumed samples: 1620992 | consumed tokens: 3319791616 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.885824E+00 | loss scale: 4096.0 | grad norm: 6.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.154 | TFLOPs: 42.11 | +[default7]: iteration 1584/ 6200 | consumed samples: 1622016 | consumed tokens: 3321888768 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.910886E+00 | loss scale: 4096.0 | grad norm: 7.627 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.980 | TFLOPs: 42.06 | +[default7]: iteration 1585/ 6200 | consumed samples: 1623040 | consumed tokens: 3323985920 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.888720E+00 | loss scale: 4096.0 | grad norm: 6.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.330 | TFLOPs: 42.16 | +[default7]: iteration 1586/ 6200 | consumed samples: 1624064 | consumed tokens: 3326083072 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.899950E+00 | loss scale: 4096.0 | grad norm: 6.064 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.903 | TFLOPs: 42.03 | +[default7]: iteration 1587/ 6200 | consumed samples: 1625088 | consumed tokens: 3328180224 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.911010E+00 | loss scale: 4096.0 | grad norm: 6.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.229 | TFLOPs: 42.13 | +[default7]: iteration 1588/ 6200 | consumed samples: 1626112 | consumed tokens: 3330277376 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.887535E+00 | loss scale: 4096.0 | grad norm: 8.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.473 | TFLOPs: 42.21 | +[default7]: iteration 1589/ 6200 | consumed samples: 1627136 | consumed tokens: 3332374528 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.912265E+00 | loss scale: 4096.0 | grad norm: 7.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.224 | TFLOPs: 42.13 | +[default7]: iteration 1590/ 6200 | consumed samples: 1628160 | consumed tokens: 3334471680 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.905077E+00 | loss scale: 4096.0 | grad norm: 6.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.057 | TFLOPs: 42.08 | +[default7]: iteration 1591/ 6200 | consumed samples: 1629184 | consumed tokens: 3336568832 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.912807E+00 | loss scale: 4096.0 | grad norm: 6.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.686 | TFLOPs: 42.27 | +[default7]: iteration 1592/ 6200 | consumed samples: 1630208 | consumed tokens: 3338665984 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.889783E+00 | loss scale: 4096.0 | grad norm: 6.649 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.703 | TFLOPs: 42.28 | +[default7]: iteration 1593/ 6200 | consumed samples: 1631232 | consumed tokens: 3340763136 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.904823E+00 | loss scale: 4096.0 | grad norm: 5.910 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.873 | TFLOPs: 42.33 | +[default7]: iteration 1594/ 6200 | consumed samples: 1632256 | consumed tokens: 3342860288 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.880749E+00 | loss scale: 4096.0 | grad norm: 5.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.724 | TFLOPs: 42.28 | +[default7]: iteration 1595/ 6200 | consumed samples: 1633280 | consumed tokens: 3344957440 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895969E+00 | loss scale: 4096.0 | grad norm: 6.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.797 | TFLOPs: 42.31 | +[default7]: iteration 1596/ 6200 | consumed samples: 1634304 | consumed tokens: 3347054592 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.908887E+00 | loss scale: 4096.0 | grad norm: 5.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.807 | TFLOPs: 42.31 | +[default7]: iteration 1597/ 6200 | consumed samples: 1635328 | consumed tokens: 3349151744 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.897455E+00 | loss scale: 4096.0 | grad norm: 5.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.762 | TFLOPs: 42.30 | +[default7]: iteration 1598/ 6200 | consumed samples: 1636352 | consumed tokens: 3351248896 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.927389E+00 | loss scale: 4096.0 | grad norm: 6.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.754 | TFLOPs: 42.29 | +[default7]: iteration 1599/ 6200 | consumed samples: 1637376 | consumed tokens: 3353346048 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.904624E+00 | loss scale: 4096.0 | grad norm: 6.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.836 | TFLOPs: 42.32 | +[default7]: iteration 1600/ 6200 | consumed samples: 1638400 | consumed tokens: 3355443200 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.893145E+00 | loss scale: 4096.0 | grad norm: 8.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.773 | TFLOPs: 42.30 | +[default7]: iteration 1601/ 6200 | consumed samples: 1639424 | consumed tokens: 3357540352 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.897812E+00 | loss scale: 4096.0 | grad norm: 5.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.665 | TFLOPs: 42.27 | +[default7]: iteration 1602/ 6200 | consumed samples: 1640448 | consumed tokens: 3359637504 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895980E+00 | loss scale: 4096.0 | grad norm: 5.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.724 | TFLOPs: 42.28 | +[default7]: iteration 1603/ 6200 | consumed samples: 1641472 | consumed tokens: 3361734656 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.916210E+00 | loss scale: 4096.0 | grad norm: 5.742 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.842 | TFLOPs: 42.32 | +[default7]: iteration 1604/ 6200 | consumed samples: 1642496 | consumed tokens: 3363831808 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.916758E+00 | loss scale: 4096.0 | grad norm: 6.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.354 | TFLOPs: 42.17 | +[default7]: iteration 1605/ 6200 | consumed samples: 1643520 | consumed tokens: 3365928960 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.923501E+00 | loss scale: 4096.0 | grad norm: 6.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.470 | TFLOPs: 42.21 | +[default7]: iteration 1606/ 6200 | consumed samples: 1644544 | consumed tokens: 3368026112 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.932765E+00 | loss scale: 4096.0 | grad norm: 7.042 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.292 | TFLOPs: 42.15 | +[default7]: iteration 1607/ 6200 | consumed samples: 1645568 | consumed tokens: 3370123264 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895009E+00 | loss scale: 4096.0 | grad norm: 5.831 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.040 | TFLOPs: 42.08 | +[default7]: iteration 1608/ 6200 | consumed samples: 1646592 | consumed tokens: 3372220416 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884666E+00 | loss scale: 4096.0 | grad norm: 5.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.241 | TFLOPs: 42.14 | +[default7]: iteration 1609/ 6200 | consumed samples: 1647616 | consumed tokens: 3374317568 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.893058E+00 | loss scale: 4096.0 | grad norm: 7.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.262 | TFLOPs: 42.14 | +[default7]: iteration 1610/ 6200 | consumed samples: 1648640 | consumed tokens: 3376414720 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.904653E+00 | loss scale: 4096.0 | grad norm: 5.718 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.335 | TFLOPs: 42.17 | +[default7]: iteration 1611/ 6200 | consumed samples: 1649664 | consumed tokens: 3378511872 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.905709E+00 | loss scale: 4096.0 | grad norm: 5.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.394 | TFLOPs: 42.18 | +[default7]: iteration 1612/ 6200 | consumed samples: 1650688 | consumed tokens: 3380609024 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.901761E+00 | loss scale: 4096.0 | grad norm: 6.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.823 | TFLOPs: 42.01 | +[default7]: iteration 1613/ 6200 | consumed samples: 1651712 | consumed tokens: 3382706176 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.907679E+00 | loss scale: 4096.0 | grad norm: 5.790 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.163 | TFLOPs: 42.11 | +[default7]: iteration 1614/ 6200 | consumed samples: 1652736 | consumed tokens: 3384803328 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884981E+00 | loss scale: 4096.0 | grad norm: 5.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.642 | TFLOPs: 42.26 | +[default7]: iteration 1615/ 6200 | consumed samples: 1653760 | consumed tokens: 3386900480 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.882156E+00 | loss scale: 4096.0 | grad norm: 5.927 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.754 | TFLOPs: 42.29 | +[default7]: iteration 1616/ 6200 | consumed samples: 1654784 | consumed tokens: 3388997632 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.913942E+00 | loss scale: 4096.0 | grad norm: 5.894 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.873 | TFLOPs: 42.33 | +[default7]: iteration 1617/ 6200 | consumed samples: 1655808 | consumed tokens: 3391094784 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895640E+00 | loss scale: 4096.0 | grad norm: 8.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.805 | TFLOPs: 42.31 | +[default7]: iteration 1618/ 6200 | consumed samples: 1656832 | consumed tokens: 3393191936 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.886162E+00 | loss scale: 4096.0 | grad norm: 6.559 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.714 | TFLOPs: 42.28 | +[default7]: iteration 1619/ 6200 | consumed samples: 1657856 | consumed tokens: 3395289088 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.917989E+00 | loss scale: 4096.0 | grad norm: 5.866 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.186 | TFLOPs: 42.12 | +[default7]: iteration 1620/ 6200 | consumed samples: 1658880 | consumed tokens: 3397386240 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.874880E+00 | loss scale: 4096.0 | grad norm: 6.069 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.208 | TFLOPs: 42.13 | +[default7]: iteration 1621/ 6200 | consumed samples: 1659904 | consumed tokens: 3399483392 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.907780E+00 | loss scale: 4096.0 | grad norm: 7.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.396 | TFLOPs: 42.18 | +[default7]: iteration 1622/ 6200 | consumed samples: 1660928 | consumed tokens: 3401580544 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.875759E+00 | loss scale: 4096.0 | grad norm: 5.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.472 | TFLOPs: 42.21 | +[default7]: iteration 1623/ 6200 | consumed samples: 1661952 | consumed tokens: 3403677696 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.872621E+00 | loss scale: 4096.0 | grad norm: 7.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.262 | TFLOPs: 42.14 | +[default7]: iteration 1624/ 6200 | consumed samples: 1662976 | consumed tokens: 3405774848 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.896210E+00 | loss scale: 4096.0 | grad norm: 6.952 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.361 | TFLOPs: 42.17 | +[default0]:[2022-10-06 16:12:36,238] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[default7]: iteration 1625/ 6200 | consumed samples: 1664000 | consumed tokens: 3407872000 | elapsed time per iteration (s): 7.29 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895995E+00 | loss scale: 2048.0 | grad norm: 6.952 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 140.502 | TFLOPs: 42.83 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 1625 | lm loss value: 3.457815E+00 | lm loss PPL: 3.174754E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 1625 | lm loss value: 1.805579E+00 | lm loss PPL: 6.083492E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 1626/ 6200 | consumed samples: 1665024 | consumed tokens: 3409969152 | elapsed time per iteration (s): 51.94 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.897729E+00 | loss scale: 2048.0 | grad norm: 6.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.715 | TFLOPs: 6.01 | +[default7]: iteration 1627/ 6200 | consumed samples: 1666048 | consumed tokens: 3412066304 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.923204E+00 | loss scale: 2048.0 | grad norm: 5.857 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.945 | TFLOPs: 42.05 | +[default7]: iteration 1628/ 6200 | consumed samples: 1667072 | consumed tokens: 3414163456 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.893403E+00 | loss scale: 2048.0 | grad norm: 6.050 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.036 | TFLOPs: 42.07 | +[default7]: iteration 1629/ 6200 | consumed samples: 1668096 | consumed tokens: 3416260608 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.889330E+00 | loss scale: 2048.0 | grad norm: 6.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.286 | TFLOPs: 42.15 | +[default7]: iteration 1630/ 6200 | consumed samples: 1669120 | consumed tokens: 3418357760 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.885795E+00 | loss scale: 2048.0 | grad norm: 6.010 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.102 | TFLOPs: 42.09 | +[default7]: iteration 1631/ 6200 | consumed samples: 1670144 | consumed tokens: 3420454912 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.913652E+00 | loss scale: 2048.0 | grad norm: 8.017 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.081 | TFLOPs: 42.09 | +[default7]: iteration 1632/ 6200 | consumed samples: 1671168 | consumed tokens: 3422552064 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895751E+00 | loss scale: 2048.0 | grad norm: 7.857 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.082 | TFLOPs: 42.09 | +[default7]: iteration 1633/ 6200 | consumed samples: 1672192 | consumed tokens: 3424649216 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.877333E+00 | loss scale: 2048.0 | grad norm: 5.873 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.025 | TFLOPs: 42.07 | +[default7]: iteration 1634/ 6200 | consumed samples: 1673216 | consumed tokens: 3426746368 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.878134E+00 | loss scale: 2048.0 | grad norm: 7.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.963 | TFLOPs: 42.05 | +[default7]: iteration 1635/ 6200 | consumed samples: 1674240 | consumed tokens: 3428843520 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.903141E+00 | loss scale: 2048.0 | grad norm: 7.961 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.143 | TFLOPs: 42.11 | +[default7]: iteration 1636/ 6200 | consumed samples: 1675264 | consumed tokens: 3430940672 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.872409E+00 | loss scale: 2048.0 | grad norm: 6.813 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.284 | TFLOPs: 42.15 | +[default7]: iteration 1637/ 6200 | consumed samples: 1676288 | consumed tokens: 3433037824 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.925724E+00 | loss scale: 2048.0 | grad norm: 6.727 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.105 | TFLOPs: 42.10 | +[default7]: iteration 1638/ 6200 | consumed samples: 1677312 | consumed tokens: 3435134976 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.897399E+00 | loss scale: 2048.0 | grad norm: 6.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.123 | TFLOPs: 42.10 | +[default7]: iteration 1639/ 6200 | consumed samples: 1678336 | consumed tokens: 3437232128 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.880224E+00 | loss scale: 2048.0 | grad norm: 8.841 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.286 | TFLOPs: 42.15 | +[default7]: iteration 1640/ 6200 | consumed samples: 1679360 | consumed tokens: 3439329280 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895999E+00 | loss scale: 2048.0 | grad norm: 6.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.083 | TFLOPs: 42.09 | +[default7]: iteration 1641/ 6200 | consumed samples: 1680384 | consumed tokens: 3441426432 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.900764E+00 | loss scale: 2048.0 | grad norm: 5.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.544 | TFLOPs: 42.23 | +[default7]: iteration 1642/ 6200 | consumed samples: 1681408 | consumed tokens: 3443523584 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.901020E+00 | loss scale: 2048.0 | grad norm: 8.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.164 | TFLOPs: 42.11 | +[default7]: iteration 1643/ 6200 | consumed samples: 1682432 | consumed tokens: 3445620736 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.897726E+00 | loss scale: 2048.0 | grad norm: 8.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.823 | TFLOPs: 42.31 | +[default7]: iteration 1644/ 6200 | consumed samples: 1683456 | consumed tokens: 3447717888 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.894491E+00 | loss scale: 2048.0 | grad norm: 6.772 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.725 | TFLOPs: 42.28 | +[default7]: iteration 1645/ 6200 | consumed samples: 1684480 | consumed tokens: 3449815040 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.887596E+00 | loss scale: 2048.0 | grad norm: 6.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.707 | TFLOPs: 42.28 | +[default7]: iteration 1646/ 6200 | consumed samples: 1685504 | consumed tokens: 3451912192 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.910918E+00 | loss scale: 2048.0 | grad norm: 6.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.148 | TFLOPs: 42.11 | +[default7]: iteration 1647/ 6200 | consumed samples: 1686528 | consumed tokens: 3454009344 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.906741E+00 | loss scale: 2048.0 | grad norm: 6.071 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.293 | TFLOPs: 42.15 | +[default7]: iteration 1648/ 6200 | consumed samples: 1687552 | consumed tokens: 3456106496 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.891330E+00 | loss scale: 2048.0 | grad norm: 5.958 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.447 | TFLOPs: 42.20 | +[default7]: iteration 1649/ 6200 | consumed samples: 1688576 | consumed tokens: 3458203648 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.904778E+00 | loss scale: 2048.0 | grad norm: 5.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.150 | TFLOPs: 42.11 | +[default7]: iteration 1650/ 6200 | consumed samples: 1689600 | consumed tokens: 3460300800 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.898644E+00 | loss scale: 2048.0 | grad norm: 6.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.214 | TFLOPs: 42.13 | +[default7]: iteration 1651/ 6200 | consumed samples: 1690624 | consumed tokens: 3462397952 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.907994E+00 | loss scale: 2048.0 | grad norm: 5.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.305 | TFLOPs: 42.16 | +[default7]: iteration 1652/ 6200 | consumed samples: 1691648 | consumed tokens: 3464495104 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884445E+00 | loss scale: 2048.0 | grad norm: 5.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.916 | TFLOPs: 42.04 | +[default7]: iteration 1653/ 6200 | consumed samples: 1692672 | consumed tokens: 3466592256 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.894951E+00 | loss scale: 2048.0 | grad norm: 5.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.066 | TFLOPs: 42.08 | +[default7]: iteration 1654/ 6200 | consumed samples: 1693696 | consumed tokens: 3468689408 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.896689E+00 | loss scale: 2048.0 | grad norm: 6.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.351 | TFLOPs: 42.17 | +[default7]: iteration 1655/ 6200 | consumed samples: 1694720 | consumed tokens: 3470786560 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.913417E+00 | loss scale: 2048.0 | grad norm: 5.989 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.306 | TFLOPs: 42.16 | +[default7]: iteration 1656/ 6200 | consumed samples: 1695744 | consumed tokens: 3472883712 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.900710E+00 | loss scale: 2048.0 | grad norm: 6.645 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.601 | TFLOPs: 42.25 | +[default7]: iteration 1657/ 6200 | consumed samples: 1696768 | consumed tokens: 3474980864 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.901186E+00 | loss scale: 2048.0 | grad norm: 6.002 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.188 | TFLOPs: 42.12 | +[default7]: iteration 1658/ 6200 | consumed samples: 1697792 | consumed tokens: 3477078016 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.899646E+00 | loss scale: 2048.0 | grad norm: 5.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.196 | TFLOPs: 42.12 | +[default7]: iteration 1659/ 6200 | consumed samples: 1698816 | consumed tokens: 3479175168 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.936172E+00 | loss scale: 2048.0 | grad norm: 7.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.703 | TFLOPs: 42.28 | +[default7]: iteration 1660/ 6200 | consumed samples: 1699840 | consumed tokens: 3481272320 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.864451E+00 | loss scale: 2048.0 | grad norm: 5.814 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.125 | TFLOPs: 42.10 | +[default7]: iteration 1661/ 6200 | consumed samples: 1700864 | consumed tokens: 3483369472 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.893885E+00 | loss scale: 2048.0 | grad norm: 5.787 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.535 | TFLOPs: 42.23 | +[default7]: iteration 1662/ 6200 | consumed samples: 1701888 | consumed tokens: 3485466624 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.894820E+00 | loss scale: 2048.0 | grad norm: 6.014 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.186 | TFLOPs: 42.12 | +[default7]: iteration 1663/ 6200 | consumed samples: 1702912 | consumed tokens: 3487563776 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.889749E+00 | loss scale: 2048.0 | grad norm: 5.992 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.154 | TFLOPs: 42.11 | +[default7]: iteration 1664/ 6200 | consumed samples: 1703936 | consumed tokens: 3489660928 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884659E+00 | loss scale: 2048.0 | grad norm: 5.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.206 | TFLOPs: 42.13 | +[default7]: iteration 1665/ 6200 | consumed samples: 1704960 | consumed tokens: 3491758080 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.886198E+00 | loss scale: 2048.0 | grad norm: 5.972 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.652 | TFLOPs: 42.26 | +[default7]: iteration 1666/ 6200 | consumed samples: 1705984 | consumed tokens: 3493855232 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884279E+00 | loss scale: 2048.0 | grad norm: 5.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.789 | TFLOPs: 42.30 | +[default7]: iteration 1667/ 6200 | consumed samples: 1707008 | consumed tokens: 3495952384 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.898234E+00 | loss scale: 2048.0 | grad norm: 6.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.844 | TFLOPs: 42.02 | +[default7]: iteration 1668/ 6200 | consumed samples: 1708032 | consumed tokens: 3498049536 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.881390E+00 | loss scale: 2048.0 | grad norm: 6.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.245 | TFLOPs: 42.14 | +[default7]: iteration 1669/ 6200 | consumed samples: 1709056 | consumed tokens: 3500146688 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.881169E+00 | loss scale: 2048.0 | grad norm: 5.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.638 | TFLOPs: 42.26 | +[default7]: iteration 1670/ 6200 | consumed samples: 1710080 | consumed tokens: 3502243840 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.870183E+00 | loss scale: 2048.0 | grad norm: 5.629 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.666 | TFLOPs: 42.27 | +[default7]: iteration 1671/ 6200 | consumed samples: 1711104 | consumed tokens: 3504340992 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.897884E+00 | loss scale: 2048.0 | grad norm: 5.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.159 | TFLOPs: 42.11 | +[default7]: iteration 1672/ 6200 | consumed samples: 1712128 | consumed tokens: 3506438144 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895701E+00 | loss scale: 2048.0 | grad norm: 5.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.321 | TFLOPs: 42.16 | +[default7]: iteration 1673/ 6200 | consumed samples: 1713152 | consumed tokens: 3508535296 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.896179E+00 | loss scale: 2048.0 | grad norm: 6.004 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.667 | TFLOPs: 42.27 | +[default7]: iteration 1674/ 6200 | consumed samples: 1714176 | consumed tokens: 3510632448 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.878224E+00 | loss scale: 2048.0 | grad norm: 5.683 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.216 | TFLOPs: 42.13 | +[default7]: iteration 1675/ 6200 | consumed samples: 1715200 | consumed tokens: 3512729600 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.915969E+00 | loss scale: 2048.0 | grad norm: 5.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.796 | TFLOPs: 42.31 | +[default7]: iteration 1676/ 6200 | consumed samples: 1716224 | consumed tokens: 3514826752 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.885828E+00 | loss scale: 2048.0 | grad norm: 5.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.496 | TFLOPs: 42.21 | +[default7]: iteration 1677/ 6200 | consumed samples: 1717248 | consumed tokens: 3516923904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.902912E+00 | loss scale: 2048.0 | grad norm: 5.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.763 | TFLOPs: 42.30 | +[default7]: iteration 1678/ 6200 | consumed samples: 1718272 | consumed tokens: 3519021056 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.872646E+00 | loss scale: 2048.0 | grad norm: 6.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.484 | TFLOPs: 42.21 | +[default7]: iteration 1679/ 6200 | consumed samples: 1719296 | consumed tokens: 3521118208 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.894177E+00 | loss scale: 2048.0 | grad norm: 5.658 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.831 | TFLOPs: 42.32 | +[default7]: iteration 1680/ 6200 | consumed samples: 1720320 | consumed tokens: 3523215360 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.882878E+00 | loss scale: 2048.0 | grad norm: 6.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.643 | TFLOPs: 42.26 | +[default7]: iteration 1681/ 6200 | consumed samples: 1721344 | consumed tokens: 3525312512 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.900262E+00 | loss scale: 2048.0 | grad norm: 6.745 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.828 | TFLOPs: 42.32 | +[default7]: iteration 1682/ 6200 | consumed samples: 1722368 | consumed tokens: 3527409664 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.907522E+00 | loss scale: 2048.0 | grad norm: 6.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.664 | TFLOPs: 42.27 | +[default7]: iteration 1683/ 6200 | consumed samples: 1723392 | consumed tokens: 3529506816 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.881819E+00 | loss scale: 2048.0 | grad norm: 5.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.570 | TFLOPs: 42.24 | +[default7]: iteration 1684/ 6200 | consumed samples: 1724416 | consumed tokens: 3531603968 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.916689E+00 | loss scale: 2048.0 | grad norm: 7.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.741 | TFLOPs: 42.29 | +[default7]: iteration 1685/ 6200 | consumed samples: 1725440 | consumed tokens: 3533701120 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.870823E+00 | loss scale: 2048.0 | grad norm: 6.704 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.465 | TFLOPs: 42.21 | +[default7]: iteration 1686/ 6200 | consumed samples: 1726464 | consumed tokens: 3535798272 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.878284E+00 | loss scale: 2048.0 | grad norm: 5.039 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.780 | TFLOPs: 42.30 | +[default7]: iteration 1687/ 6200 | consumed samples: 1727488 | consumed tokens: 3537895424 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884059E+00 | loss scale: 2048.0 | grad norm: 6.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.431 | TFLOPs: 42.20 | +[default7]: iteration 1688/ 6200 | consumed samples: 1728512 | consumed tokens: 3539992576 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.879380E+00 | loss scale: 2048.0 | grad norm: 5.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.550 | TFLOPs: 42.23 | +[default7]: iteration 1689/ 6200 | consumed samples: 1729536 | consumed tokens: 3542089728 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.890630E+00 | loss scale: 2048.0 | grad norm: 5.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.528 | TFLOPs: 42.22 | +[default7]: iteration 1690/ 6200 | consumed samples: 1730560 | consumed tokens: 3544186880 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.898462E+00 | loss scale: 2048.0 | grad norm: 5.792 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.614 | TFLOPs: 42.25 | +[default7]: iteration 1691/ 6200 | consumed samples: 1731584 | consumed tokens: 3546284032 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.862961E+00 | loss scale: 2048.0 | grad norm: 5.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.722 | TFLOPs: 42.28 | +[default7]: iteration 1692/ 6200 | consumed samples: 1732608 | consumed tokens: 3548381184 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.880293E+00 | loss scale: 2048.0 | grad norm: 6.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.714 | TFLOPs: 42.28 | +[default7]: iteration 1693/ 6200 | consumed samples: 1733632 | consumed tokens: 3550478336 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.878425E+00 | loss scale: 2048.0 | grad norm: 6.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.639 | TFLOPs: 41.95 | +[default7]: iteration 1694/ 6200 | consumed samples: 1734656 | consumed tokens: 3552575488 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884168E+00 | loss scale: 2048.0 | grad norm: 7.810 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.786 | TFLOPs: 42.30 | +[default7]: iteration 1695/ 6200 | consumed samples: 1735680 | consumed tokens: 3554672640 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.903674E+00 | loss scale: 2048.0 | grad norm: 6.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.982 | TFLOPs: 42.36 | +[default7]: iteration 1696/ 6200 | consumed samples: 1736704 | consumed tokens: 3556769792 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.892469E+00 | loss scale: 2048.0 | grad norm: 6.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.114 | TFLOPs: 42.40 | +[default7]: iteration 1697/ 6200 | consumed samples: 1737728 | consumed tokens: 3558866944 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.891492E+00 | loss scale: 2048.0 | grad norm: 5.864 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.537 | TFLOPs: 42.23 | +[default7]: iteration 1698/ 6200 | consumed samples: 1738752 | consumed tokens: 3560964096 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.892325E+00 | loss scale: 2048.0 | grad norm: 6.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.175 | TFLOPs: 42.42 | +[default7]: iteration 1699/ 6200 | consumed samples: 1739776 | consumed tokens: 3563061248 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.882366E+00 | loss scale: 2048.0 | grad norm: 7.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.070 | TFLOPs: 42.39 | +[default7]: iteration 1700/ 6200 | consumed samples: 1740800 | consumed tokens: 3565158400 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.889112E+00 | loss scale: 2048.0 | grad norm: 6.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.198 | TFLOPs: 42.43 | +[default7]: iteration 1701/ 6200 | consumed samples: 1741824 | consumed tokens: 3567255552 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.907132E+00 | loss scale: 2048.0 | grad norm: 6.738 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.969 | TFLOPs: 42.36 | +[default7]: iteration 1702/ 6200 | consumed samples: 1742848 | consumed tokens: 3569352704 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.892998E+00 | loss scale: 2048.0 | grad norm: 7.075 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.998 | TFLOPs: 42.37 | +[default7]: iteration 1703/ 6200 | consumed samples: 1743872 | consumed tokens: 3571449856 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.871507E+00 | loss scale: 2048.0 | grad norm: 7.022 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.093 | TFLOPs: 42.40 | +[default7]: iteration 1704/ 6200 | consumed samples: 1744896 | consumed tokens: 3573547008 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.881210E+00 | loss scale: 2048.0 | grad norm: 5.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.641 | TFLOPs: 42.26 | +[default7]: iteration 1705/ 6200 | consumed samples: 1745920 | consumed tokens: 3575644160 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895544E+00 | loss scale: 2048.0 | grad norm: 5.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.756 | TFLOPs: 42.29 | +[default7]: iteration 1706/ 6200 | consumed samples: 1746944 | consumed tokens: 3577741312 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.911757E+00 | loss scale: 2048.0 | grad norm: 6.806 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.931 | TFLOPs: 42.35 | +[default7]: iteration 1707/ 6200 | consumed samples: 1747968 | consumed tokens: 3579838464 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.901236E+00 | loss scale: 2048.0 | grad norm: 5.842 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.860 | TFLOPs: 42.02 | +[default7]: iteration 1708/ 6200 | consumed samples: 1748992 | consumed tokens: 3581935616 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.875890E+00 | loss scale: 2048.0 | grad norm: 5.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.005 | TFLOPs: 42.07 | +[default7]: iteration 1709/ 6200 | consumed samples: 1750016 | consumed tokens: 3584032768 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.886341E+00 | loss scale: 2048.0 | grad norm: 5.945 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.775 | TFLOPs: 42.00 | +[default7]: iteration 1710/ 6200 | consumed samples: 1751040 | consumed tokens: 3586129920 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.905529E+00 | loss scale: 2048.0 | grad norm: 6.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.543 | TFLOPs: 42.23 | +[default7]: iteration 1711/ 6200 | consumed samples: 1752064 | consumed tokens: 3588227072 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.879200E+00 | loss scale: 2048.0 | grad norm: 5.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.596 | TFLOPs: 41.94 | +[default7]: iteration 1712/ 6200 | consumed samples: 1753088 | consumed tokens: 3590324224 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.917366E+00 | loss scale: 2048.0 | grad norm: 5.875 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.009 | TFLOPs: 42.07 | +[default7]: iteration 1713/ 6200 | consumed samples: 1754112 | consumed tokens: 3592421376 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884186E+00 | loss scale: 2048.0 | grad norm: 5.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.498 | TFLOPs: 41.91 | +[default7]: iteration 1714/ 6200 | consumed samples: 1755136 | consumed tokens: 3594518528 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.873279E+00 | loss scale: 2048.0 | grad norm: 6.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.099 | TFLOPs: 42.09 | +[default7]: iteration 1715/ 6200 | consumed samples: 1756160 | consumed tokens: 3596615680 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.870810E+00 | loss scale: 2048.0 | grad norm: 5.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.693 | TFLOPs: 41.97 | +[default7]: iteration 1716/ 6200 | consumed samples: 1757184 | consumed tokens: 3598712832 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.891155E+00 | loss scale: 2048.0 | grad norm: 5.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.540 | TFLOPs: 41.92 | +[default7]: iteration 1717/ 6200 | consumed samples: 1758208 | consumed tokens: 3600809984 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.893074E+00 | loss scale: 2048.0 | grad norm: 6.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.551 | TFLOPs: 41.93 | +[default7]: iteration 1718/ 6200 | consumed samples: 1759232 | consumed tokens: 3602907136 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.911572E+00 | loss scale: 2048.0 | grad norm: 5.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.587 | TFLOPs: 41.94 | +[default7]: iteration 1719/ 6200 | consumed samples: 1760256 | consumed tokens: 3605004288 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.879051E+00 | loss scale: 2048.0 | grad norm: 6.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.253 | TFLOPs: 42.14 | +[default7]: iteration 1720/ 6200 | consumed samples: 1761280 | consumed tokens: 3607101440 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.865918E+00 | loss scale: 2048.0 | grad norm: 6.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.685 | TFLOPs: 41.97 | +[default7]: iteration 1721/ 6200 | consumed samples: 1762304 | consumed tokens: 3609198592 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.885437E+00 | loss scale: 2048.0 | grad norm: 5.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.876 | TFLOPs: 42.03 | +[default7]: iteration 1722/ 6200 | consumed samples: 1763328 | consumed tokens: 3611295744 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.878468E+00 | loss scale: 2048.0 | grad norm: 8.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.832 | TFLOPs: 42.01 | +[default7]: iteration 1723/ 6200 | consumed samples: 1764352 | consumed tokens: 3613392896 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.901664E+00 | loss scale: 2048.0 | grad norm: 6.704 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.399 | TFLOPs: 41.88 | +[default7]: iteration 1724/ 6200 | consumed samples: 1765376 | consumed tokens: 3615490048 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.892989E+00 | loss scale: 2048.0 | grad norm: 6.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.200 | TFLOPs: 42.12 | +[default7]: iteration 1725/ 6200 | consumed samples: 1766400 | consumed tokens: 3617587200 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884512E+00 | loss scale: 2048.0 | grad norm: 6.830 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.906 | TFLOPs: 42.03 | +[default7]: iteration 1726/ 6200 | consumed samples: 1767424 | consumed tokens: 3619684352 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.894256E+00 | loss scale: 2048.0 | grad norm: 8.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.566 | TFLOPs: 41.93 | +[default7]: iteration 1727/ 6200 | consumed samples: 1768448 | consumed tokens: 3621781504 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.876626E+00 | loss scale: 2048.0 | grad norm: 6.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.802 | TFLOPs: 41.70 | +[default7]: iteration 1728/ 6200 | consumed samples: 1769472 | consumed tokens: 3623878656 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.887045E+00 | loss scale: 2048.0 | grad norm: 7.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.802 | TFLOPs: 42.00 | +[default7]: iteration 1729/ 6200 | consumed samples: 1770496 | consumed tokens: 3625975808 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.896108E+00 | loss scale: 2048.0 | grad norm: 5.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.017 | TFLOPs: 42.07 | +[default7]: iteration 1730/ 6200 | consumed samples: 1771520 | consumed tokens: 3628072960 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.894607E+00 | loss scale: 2048.0 | grad norm: 6.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.250 | TFLOPs: 42.14 | +[default7]: iteration 1731/ 6200 | consumed samples: 1772544 | consumed tokens: 3630170112 | elapsed time per iteration (s): 7.51 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.885141E+00 | loss scale: 2048.0 | grad norm: 6.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.430 | TFLOPs: 41.59 | +[default7]: iteration 1732/ 6200 | consumed samples: 1773568 | consumed tokens: 3632267264 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.889188E+00 | loss scale: 2048.0 | grad norm: 6.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.651 | TFLOPs: 41.96 | +[default7]: iteration 1733/ 6200 | consumed samples: 1774592 | consumed tokens: 3634364416 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895411E+00 | loss scale: 2048.0 | grad norm: 6.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.778 | TFLOPs: 42.00 | +[default7]: iteration 1734/ 6200 | consumed samples: 1775616 | consumed tokens: 3636461568 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.891316E+00 | loss scale: 2048.0 | grad norm: 7.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.856 | TFLOPs: 42.02 | +[default7]: iteration 1735/ 6200 | consumed samples: 1776640 | consumed tokens: 3638558720 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.879727E+00 | loss scale: 2048.0 | grad norm: 7.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.599 | TFLOPs: 41.64 | +[default7]: iteration 1736/ 6200 | consumed samples: 1777664 | consumed tokens: 3640655872 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.876509E+00 | loss scale: 2048.0 | grad norm: 6.067 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.376 | TFLOPs: 41.87 | +[default7]: iteration 1737/ 6200 | consumed samples: 1778688 | consumed tokens: 3642753024 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.890555E+00 | loss scale: 2048.0 | grad norm: 6.542 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.010 | TFLOPs: 41.76 | +[default7]: iteration 1738/ 6200 | consumed samples: 1779712 | consumed tokens: 3644850176 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.879035E+00 | loss scale: 2048.0 | grad norm: 6.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.650 | TFLOPs: 41.96 | +[default7]: iteration 1739/ 6200 | consumed samples: 1780736 | consumed tokens: 3646947328 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.864245E+00 | loss scale: 2048.0 | grad norm: 5.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.654 | TFLOPs: 41.65 | +[default7]: iteration 1740/ 6200 | consumed samples: 1781760 | consumed tokens: 3649044480 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.871492E+00 | loss scale: 2048.0 | grad norm: 6.754 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.195 | TFLOPs: 41.82 | +[default7]: iteration 1741/ 6200 | consumed samples: 1782784 | consumed tokens: 3651141632 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.879578E+00 | loss scale: 2048.0 | grad norm: 5.918 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.317 | TFLOPs: 41.86 | +[default7]: iteration 1742/ 6200 | consumed samples: 1783808 | consumed tokens: 3653238784 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.907524E+00 | loss scale: 2048.0 | grad norm: 5.901 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.821 | TFLOPs: 42.31 | +[default7]: iteration 1743/ 6200 | consumed samples: 1784832 | consumed tokens: 3655335936 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.861587E+00 | loss scale: 2048.0 | grad norm: 5.929 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.009 | TFLOPs: 42.37 | +[default7]: iteration 1744/ 6200 | consumed samples: 1785856 | consumed tokens: 3657433088 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.854185E+00 | loss scale: 2048.0 | grad norm: 6.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.167 | TFLOPs: 42.42 | +[default7]: iteration 1745/ 6200 | consumed samples: 1786880 | consumed tokens: 3659530240 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.875385E+00 | loss scale: 2048.0 | grad norm: 6.074 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.321 | TFLOPs: 42.47 | +[default7]: iteration 1746/ 6200 | consumed samples: 1787904 | consumed tokens: 3661627392 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.888357E+00 | loss scale: 2048.0 | grad norm: 5.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.429 | TFLOPs: 42.19 | +[default7]: iteration 1747/ 6200 | consumed samples: 1788928 | consumed tokens: 3663724544 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.892198E+00 | loss scale: 2048.0 | grad norm: 6.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.103 | TFLOPs: 42.40 | +[default7]: iteration 1748/ 6200 | consumed samples: 1789952 | consumed tokens: 3665821696 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.863677E+00 | loss scale: 2048.0 | grad norm: 5.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.112 | TFLOPs: 42.40 | +[default7]: iteration 1749/ 6200 | consumed samples: 1790976 | consumed tokens: 3667918848 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.861029E+00 | loss scale: 2048.0 | grad norm: 5.938 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.233 | TFLOPs: 42.44 | +[default7]: iteration 1750/ 6200 | consumed samples: 1792000 | consumed tokens: 3670016000 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.897183E+00 | loss scale: 2048.0 | grad norm: 6.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.898 | TFLOPs: 42.34 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 1750 | lm loss value: 3.465392E+00 | lm loss PPL: 3.198899E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 1750 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 16:29:30,996] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1750 is begin to save! +[default0]:[2022-10-06 16:29:31,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_01-model_00-model_states.pt... +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 1750 | lm loss value: 1.785315E+00 | lm loss PPL: 5.961456E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:[2022-10-06 16:29:31,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,457] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,488] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,570] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,623] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,679] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,705] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,759] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,786] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,813] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,868] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,921] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:31,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:31,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:32,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:32,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:32,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:32,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:32,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:32,062] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 16:29:32,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 16:29:32,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:32,133] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:32,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:32,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 16:29:32,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 16:29:32,162] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/mp_rank_00_model_states.pt +[default0]:[2022-10-06 16:29:32,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 16:29:32,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 16:29:32,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 16:29:32,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 16:29:32,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 16:29:32,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 16:29:32,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 16:29:32,178] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 16:29:32,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 16:29:32,363] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 16:29:32,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 16:29:32,345] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 16:29:32,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 16:29:32,369] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 16:29:32,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 16:29:32,379] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 16:29:32,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 16:29:32,374] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 16:29:32,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 16:29:32,446] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 16:29:32,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 16:29:32,447] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 16:29:32,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 16:29:32,410] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 16:29:32,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 16:29:32,449] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 16:29:32,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 16:29:32,465] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 16:29:32,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 16:29:32,442] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 16:29:32,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 16:29:32,414] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 16:29:32,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 16:29:32,424] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 16:29:32,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 16:29:32,492] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 16:29:32,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 16:29:32,430] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 16:29:32,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 16:29:32,430] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 16:29:32,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 16:29:32,452] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 16:29:32,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 16:29:32,476] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 16:29:32,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 16:29:32,490] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 16:29:32,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 16:29:32,475] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 16:29:32,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 16:29:32,436] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 16:29:32,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 16:29:32,460] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 16:29:32,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 16:29:32,459] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default6]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default7]:[2022-10-06 16:29:32,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 16:29:32,462] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default5]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default4]:[2022-10-06 16:29:32,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 16:29:32,531] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default3]:[2022-10-06 16:29:32,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 16:29:32,482] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default6]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default2]:[2022-10-06 16:29:32,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 16:29:32,481] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default6]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default0]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default7]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default0]:[2022-10-06 16:29:32,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 16:29:32,491] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default1]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default1]:[2022-10-06 16:29:32,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 16:29:32,499] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default3]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default5]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default2]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default1]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default6]:[2022-10-06 16:29:32,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 16:29:32,533] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default2]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default3]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default5]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 16:29:32,542] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default4]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default2]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default0]:[2022-10-06 16:29:32,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 16:29:32,538] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step1750/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default0]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default0]: successfully saved checkpoint at iteration 1750 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default1]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default4]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default7]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default7]:time (ms) | save-checkpoint: 1547.85 +[default4]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default7]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default5]:[2022-10-06 16:29:32,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1750 is ready now! +[default7]: iteration 1751/ 6200 | consumed samples: 1793024 | consumed tokens: 3672113152 | elapsed time per iteration (s): 53.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.862718E+00 | loss scale: 2048.0 | grad norm: 5.893 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.166 | TFLOPs: 5.84 | +[default7]: iteration 1752/ 6200 | consumed samples: 1794048 | consumed tokens: 3674210304 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.872702E+00 | loss scale: 2048.0 | grad norm: 7.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.182 | TFLOPs: 41.81 | +[default7]: iteration 1753/ 6200 | consumed samples: 1795072 | consumed tokens: 3676307456 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.875060E+00 | loss scale: 2048.0 | grad norm: 5.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.309 | TFLOPs: 42.16 | +[default7]: iteration 1754/ 6200 | consumed samples: 1796096 | consumed tokens: 3678404608 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.887850E+00 | loss scale: 2048.0 | grad norm: 5.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.157 | TFLOPs: 42.11 | +[default7]: iteration 1755/ 6200 | consumed samples: 1797120 | consumed tokens: 3680501760 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895647E+00 | loss scale: 2048.0 | grad norm: 5.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.305 | TFLOPs: 42.16 | +[default7]: iteration 1756/ 6200 | consumed samples: 1798144 | consumed tokens: 3682598912 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.901983E+00 | loss scale: 2048.0 | grad norm: 5.943 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.255 | TFLOPs: 42.14 | +[default7]: iteration 1757/ 6200 | consumed samples: 1799168 | consumed tokens: 3684696064 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.886047E+00 | loss scale: 2048.0 | grad norm: 5.815 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.252 | TFLOPs: 42.14 | +[default7]: iteration 1758/ 6200 | consumed samples: 1800192 | consumed tokens: 3686793216 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.890653E+00 | loss scale: 2048.0 | grad norm: 5.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.149 | TFLOPs: 42.11 | +[default7]: iteration 1759/ 6200 | consumed samples: 1801216 | consumed tokens: 3688890368 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884318E+00 | loss scale: 2048.0 | grad norm: 6.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.824 | TFLOPs: 42.01 | +[default7]: iteration 1760/ 6200 | consumed samples: 1802240 | consumed tokens: 3690987520 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.886507E+00 | loss scale: 2048.0 | grad norm: 6.031 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.652 | TFLOPs: 41.96 | +[default7]: iteration 1761/ 6200 | consumed samples: 1803264 | consumed tokens: 3693084672 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.854530E+00 | loss scale: 2048.0 | grad norm: 6.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.890 | TFLOPs: 42.03 | +[default7]: iteration 1762/ 6200 | consumed samples: 1804288 | consumed tokens: 3695181824 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895380E+00 | loss scale: 2048.0 | grad norm: 6.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.657 | TFLOPs: 41.96 | +[default7]: iteration 1763/ 6200 | consumed samples: 1805312 | consumed tokens: 3697278976 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.868620E+00 | loss scale: 2048.0 | grad norm: 7.553 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.676 | TFLOPs: 41.96 | +[default7]: iteration 1764/ 6200 | consumed samples: 1806336 | consumed tokens: 3699376128 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.867300E+00 | loss scale: 2048.0 | grad norm: 7.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.525 | TFLOPs: 42.22 | +[default7]: iteration 1765/ 6200 | consumed samples: 1807360 | consumed tokens: 3701473280 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.881921E+00 | loss scale: 2048.0 | grad norm: 6.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.439 | TFLOPs: 42.20 | +[default7]: iteration 1766/ 6200 | consumed samples: 1808384 | consumed tokens: 3703570432 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.872833E+00 | loss scale: 2048.0 | grad norm: 5.808 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.643 | TFLOPs: 42.26 | +[default7]: iteration 1767/ 6200 | consumed samples: 1809408 | consumed tokens: 3705667584 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.890755E+00 | loss scale: 2048.0 | grad norm: 5.589 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.806 | TFLOPs: 42.31 | +[default7]: iteration 1768/ 6200 | consumed samples: 1810432 | consumed tokens: 3707764736 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895294E+00 | loss scale: 2048.0 | grad norm: 6.775 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.483 | TFLOPs: 42.21 | +[default7]: iteration 1769/ 6200 | consumed samples: 1811456 | consumed tokens: 3709861888 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.874214E+00 | loss scale: 2048.0 | grad norm: 6.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.398 | TFLOPs: 42.18 | +[default7]: iteration 1770/ 6200 | consumed samples: 1812480 | consumed tokens: 3711959040 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.853729E+00 | loss scale: 2048.0 | grad norm: 6.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.502 | TFLOPs: 42.22 | +[default7]: iteration 1771/ 6200 | consumed samples: 1813504 | consumed tokens: 3714056192 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.896466E+00 | loss scale: 2048.0 | grad norm: 5.894 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.159 | TFLOPs: 42.11 | +[default7]: iteration 1772/ 6200 | consumed samples: 1814528 | consumed tokens: 3716153344 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.892811E+00 | loss scale: 2048.0 | grad norm: 5.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.685 | TFLOPs: 41.97 | +[default7]: iteration 1773/ 6200 | consumed samples: 1815552 | consumed tokens: 3718250496 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.860734E+00 | loss scale: 2048.0 | grad norm: 6.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.997 | TFLOPs: 42.06 | +[default7]: iteration 1774/ 6200 | consumed samples: 1816576 | consumed tokens: 3720347648 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.875603E+00 | loss scale: 2048.0 | grad norm: 6.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.359 | TFLOPs: 42.17 | +[default7]: iteration 1775/ 6200 | consumed samples: 1817600 | consumed tokens: 3722444800 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.905426E+00 | loss scale: 2048.0 | grad norm: 5.766 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.396 | TFLOPs: 42.18 | +[default7]: iteration 1776/ 6200 | consumed samples: 1818624 | consumed tokens: 3724541952 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.856539E+00 | loss scale: 2048.0 | grad norm: 7.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.084 | TFLOPs: 42.09 | +[default7]: iteration 1777/ 6200 | consumed samples: 1819648 | consumed tokens: 3726639104 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.885623E+00 | loss scale: 2048.0 | grad norm: 5.839 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.381 | TFLOPs: 42.18 | +[default7]: iteration 1778/ 6200 | consumed samples: 1820672 | consumed tokens: 3728736256 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.885151E+00 | loss scale: 2048.0 | grad norm: 5.705 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.476 | TFLOPs: 42.21 | +[default7]: iteration 1779/ 6200 | consumed samples: 1821696 | consumed tokens: 3730833408 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.875782E+00 | loss scale: 2048.0 | grad norm: 6.958 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.948 | TFLOPs: 42.05 | +[default7]: iteration 1780/ 6200 | consumed samples: 1822720 | consumed tokens: 3732930560 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.864484E+00 | loss scale: 2048.0 | grad norm: 5.704 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.076 | TFLOPs: 42.09 | +[default7]: iteration 1781/ 6200 | consumed samples: 1823744 | consumed tokens: 3735027712 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.896346E+00 | loss scale: 2048.0 | grad norm: 5.977 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.093 | TFLOPs: 42.09 | +[default7]: iteration 1782/ 6200 | consumed samples: 1824768 | consumed tokens: 3737124864 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.861977E+00 | loss scale: 2048.0 | grad norm: 6.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.464 | TFLOPs: 42.21 | +[default7]: iteration 1783/ 6200 | consumed samples: 1825792 | consumed tokens: 3739222016 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.881235E+00 | loss scale: 2048.0 | grad norm: 5.992 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.109 | TFLOPs: 42.10 | +[default7]: iteration 1784/ 6200 | consumed samples: 1826816 | consumed tokens: 3741319168 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.871240E+00 | loss scale: 2048.0 | grad norm: 6.070 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.296 | TFLOPs: 42.15 | +[default7]: iteration 1785/ 6200 | consumed samples: 1827840 | consumed tokens: 3743416320 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.885774E+00 | loss scale: 2048.0 | grad norm: 5.813 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.331 | TFLOPs: 42.16 | +[default7]: iteration 1786/ 6200 | consumed samples: 1828864 | consumed tokens: 3745513472 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.878153E+00 | loss scale: 2048.0 | grad norm: 7.990 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.006 | TFLOPs: 42.07 | +[default7]: iteration 1787/ 6200 | consumed samples: 1829888 | consumed tokens: 3747610624 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.850223E+00 | loss scale: 2048.0 | grad norm: 6.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.154 | TFLOPs: 42.11 | +[default7]: iteration 1788/ 6200 | consumed samples: 1830912 | consumed tokens: 3749707776 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.880187E+00 | loss scale: 2048.0 | grad norm: 6.079 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.177 | TFLOPs: 42.12 | +[default7]: iteration 1789/ 6200 | consumed samples: 1831936 | consumed tokens: 3751804928 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.861715E+00 | loss scale: 2048.0 | grad norm: 6.016 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.968 | TFLOPs: 42.05 | +[default7]: iteration 1790/ 6200 | consumed samples: 1832960 | consumed tokens: 3753902080 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.877787E+00 | loss scale: 2048.0 | grad norm: 5.705 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.814 | TFLOPs: 42.01 | +[default7]: iteration 1791/ 6200 | consumed samples: 1833984 | consumed tokens: 3755999232 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.878171E+00 | loss scale: 2048.0 | grad norm: 6.067 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.015 | TFLOPs: 42.07 | +[default0]:[2022-10-06 16:34:43,728] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 +[default7]: iteration 1792/ 6200 | consumed samples: 1835008 | consumed tokens: 3758096384 | elapsed time per iteration (s): 7.29 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.860061E+00 | loss scale: 1024.0 | grad norm: 6.067 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 140.553 | TFLOPs: 42.84 | +[default7]: iteration 1793/ 6200 | consumed samples: 1836032 | consumed tokens: 3760193536 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.860304E+00 | loss scale: 1024.0 | grad norm: 6.801 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.364 | TFLOPs: 42.17 | +[default7]: iteration 1794/ 6200 | consumed samples: 1837056 | consumed tokens: 3762290688 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.889565E+00 | loss scale: 1024.0 | grad norm: 6.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.302 | TFLOPs: 42.16 | +[default7]: iteration 1795/ 6200 | consumed samples: 1838080 | consumed tokens: 3764387840 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.879525E+00 | loss scale: 1024.0 | grad norm: 6.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.576 | TFLOPs: 42.24 | +[default7]: iteration 1796/ 6200 | consumed samples: 1839104 | consumed tokens: 3766484992 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.878012E+00 | loss scale: 1024.0 | grad norm: 5.749 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.773 | TFLOPs: 42.30 | +[default7]: iteration 1797/ 6200 | consumed samples: 1840128 | consumed tokens: 3768582144 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.880695E+00 | loss scale: 1024.0 | grad norm: 6.699 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.434 | TFLOPs: 42.20 | +[default7]: iteration 1798/ 6200 | consumed samples: 1841152 | consumed tokens: 3770679296 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.874455E+00 | loss scale: 1024.0 | grad norm: 6.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.137 | TFLOPs: 42.11 | +[default7]: iteration 1799/ 6200 | consumed samples: 1842176 | consumed tokens: 3772776448 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.869175E+00 | loss scale: 1024.0 | grad norm: 4.769 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.236 | TFLOPs: 42.14 | +[default7]: iteration 1800/ 6200 | consumed samples: 1843200 | consumed tokens: 3774873600 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.847043E+00 | loss scale: 1024.0 | grad norm: 6.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.023 | TFLOPs: 42.07 | +[default7]: iteration 1801/ 6200 | consumed samples: 1844224 | consumed tokens: 3776970752 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.887473E+00 | loss scale: 1024.0 | grad norm: 6.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.985 | TFLOPs: 42.06 | +[default7]: iteration 1802/ 6200 | consumed samples: 1845248 | consumed tokens: 3779067904 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.891499E+00 | loss scale: 1024.0 | grad norm: 6.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.052 | TFLOPs: 42.08 | +[default7]: iteration 1803/ 6200 | consumed samples: 1846272 | consumed tokens: 3781165056 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.851337E+00 | loss scale: 1024.0 | grad norm: 6.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.673 | TFLOPs: 41.96 | +[default7]: iteration 1804/ 6200 | consumed samples: 1847296 | consumed tokens: 3783262208 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.905801E+00 | loss scale: 1024.0 | grad norm: 5.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.313 | TFLOPs: 41.85 | +[default7]: iteration 1805/ 6200 | consumed samples: 1848320 | consumed tokens: 3785359360 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.902066E+00 | loss scale: 1024.0 | grad norm: 5.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.397 | TFLOPs: 41.88 | +[default7]: iteration 1806/ 6200 | consumed samples: 1849344 | consumed tokens: 3787456512 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.886889E+00 | loss scale: 1024.0 | grad norm: 5.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.725 | TFLOPs: 41.98 | +[default7]: iteration 1807/ 6200 | consumed samples: 1850368 | consumed tokens: 3789553664 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.851788E+00 | loss scale: 1024.0 | grad norm: 4.686 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.016 | TFLOPs: 42.07 | +[default7]: iteration 1808/ 6200 | consumed samples: 1851392 | consumed tokens: 3791650816 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.868094E+00 | loss scale: 1024.0 | grad norm: 6.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.056 | TFLOPs: 42.08 | +[default7]: iteration 1809/ 6200 | consumed samples: 1852416 | consumed tokens: 3793747968 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.876623E+00 | loss scale: 1024.0 | grad norm: 5.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.987 | TFLOPs: 42.06 | +[default7]: iteration 1810/ 6200 | consumed samples: 1853440 | consumed tokens: 3795845120 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.877374E+00 | loss scale: 1024.0 | grad norm: 5.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.050 | TFLOPs: 42.08 | +[default7]: iteration 1811/ 6200 | consumed samples: 1854464 | consumed tokens: 3797942272 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.876283E+00 | loss scale: 1024.0 | grad norm: 5.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.792 | TFLOPs: 42.00 | +[default7]: iteration 1812/ 6200 | consumed samples: 1855488 | consumed tokens: 3800039424 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.883542E+00 | loss scale: 1024.0 | grad norm: 5.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.951 | TFLOPs: 42.05 | +[default7]: iteration 1813/ 6200 | consumed samples: 1856512 | consumed tokens: 3802136576 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.860138E+00 | loss scale: 1024.0 | grad norm: 5.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.910 | TFLOPs: 42.04 | +[default7]: iteration 1814/ 6200 | consumed samples: 1857536 | consumed tokens: 3804233728 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.862647E+00 | loss scale: 1024.0 | grad norm: 6.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.778 | TFLOPs: 42.00 | +[default7]: iteration 1815/ 6200 | consumed samples: 1858560 | consumed tokens: 3806330880 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.895851E+00 | loss scale: 1024.0 | grad norm: 6.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.340 | TFLOPs: 42.17 | +[default7]: iteration 1816/ 6200 | consumed samples: 1859584 | consumed tokens: 3808428032 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.871531E+00 | loss scale: 1024.0 | grad norm: 6.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.266 | TFLOPs: 42.14 | +[default7]: iteration 1817/ 6200 | consumed samples: 1860608 | consumed tokens: 3810525184 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.872134E+00 | loss scale: 1024.0 | grad norm: 5.863 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.164 | TFLOPs: 42.11 | +[default7]: iteration 1818/ 6200 | consumed samples: 1861632 | consumed tokens: 3812622336 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.898216E+00 | loss scale: 1024.0 | grad norm: 6.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.037 | TFLOPs: 42.07 | +[default7]: iteration 1819/ 6200 | consumed samples: 1862656 | consumed tokens: 3814719488 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.881359E+00 | loss scale: 1024.0 | grad norm: 5.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.284 | TFLOPs: 42.15 | +[default7]: iteration 1820/ 6200 | consumed samples: 1863680 | consumed tokens: 3816816640 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.870617E+00 | loss scale: 1024.0 | grad norm: 6.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.346 | TFLOPs: 42.17 | +[default7]: iteration 1821/ 6200 | consumed samples: 1864704 | consumed tokens: 3818913792 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.882387E+00 | loss scale: 1024.0 | grad norm: 4.754 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.100 | TFLOPs: 42.09 | +[default7]: iteration 1822/ 6200 | consumed samples: 1865728 | consumed tokens: 3821010944 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.862104E+00 | loss scale: 1024.0 | grad norm: 5.777 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.271 | TFLOPs: 42.15 | +[default7]: iteration 1823/ 6200 | consumed samples: 1866752 | consumed tokens: 3823108096 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.891045E+00 | loss scale: 1024.0 | grad norm: 5.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.910 | TFLOPs: 42.04 | +[default7]: iteration 1824/ 6200 | consumed samples: 1867776 | consumed tokens: 3825205248 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.875217E+00 | loss scale: 1024.0 | grad norm: 5.755 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.235 | TFLOPs: 42.14 | +[default7]: iteration 1825/ 6200 | consumed samples: 1868800 | consumed tokens: 3827302400 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.906159E+00 | loss scale: 1024.0 | grad norm: 5.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.356 | TFLOPs: 42.17 | +[default7]: iteration 1826/ 6200 | consumed samples: 1869824 | consumed tokens: 3829399552 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.850512E+00 | loss scale: 1024.0 | grad norm: 5.942 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.221 | TFLOPs: 42.13 | +[default7]: iteration 1827/ 6200 | consumed samples: 1870848 | consumed tokens: 3831496704 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.885264E+00 | loss scale: 1024.0 | grad norm: 5.808 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.199 | TFLOPs: 42.12 | +[default7]: iteration 1828/ 6200 | consumed samples: 1871872 | consumed tokens: 3833593856 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.891327E+00 | loss scale: 1024.0 | grad norm: 5.606 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.156 | TFLOPs: 42.11 | +[default7]: iteration 1829/ 6200 | consumed samples: 1872896 | consumed tokens: 3835691008 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.883755E+00 | loss scale: 1024.0 | grad norm: 6.059 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.336 | TFLOPs: 42.17 | +[default7]: iteration 1830/ 6200 | consumed samples: 1873920 | consumed tokens: 3837788160 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.847221E+00 | loss scale: 1024.0 | grad norm: 6.690 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.482 | TFLOPs: 42.21 | +[default7]: iteration 1831/ 6200 | consumed samples: 1874944 | consumed tokens: 3839885312 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.866298E+00 | loss scale: 1024.0 | grad norm: 5.563 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.286 | TFLOPs: 42.15 | +[default7]: iteration 1832/ 6200 | consumed samples: 1875968 | consumed tokens: 3841982464 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.877820E+00 | loss scale: 1024.0 | grad norm: 5.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.505 | TFLOPs: 42.22 | +[default7]: iteration 1833/ 6200 | consumed samples: 1876992 | consumed tokens: 3844079616 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.870744E+00 | loss scale: 1024.0 | grad norm: 5.855 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.327 | TFLOPs: 42.16 | +[default7]: iteration 1834/ 6200 | consumed samples: 1878016 | consumed tokens: 3846176768 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.869169E+00 | loss scale: 1024.0 | grad norm: 5.687 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.077 | TFLOPs: 42.09 | +[default7]: iteration 1835/ 6200 | consumed samples: 1879040 | consumed tokens: 3848273920 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.856438E+00 | loss scale: 1024.0 | grad norm: 5.727 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.972 | TFLOPs: 42.05 | +[default7]: iteration 1836/ 6200 | consumed samples: 1880064 | consumed tokens: 3850371072 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884637E+00 | loss scale: 1024.0 | grad norm: 6.987 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.935 | TFLOPs: 42.04 | +[default7]: iteration 1837/ 6200 | consumed samples: 1881088 | consumed tokens: 3852468224 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.890116E+00 | loss scale: 1024.0 | grad norm: 6.958 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.037 | TFLOPs: 42.07 | +[default7]: iteration 1838/ 6200 | consumed samples: 1882112 | consumed tokens: 3854565376 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.846393E+00 | loss scale: 1024.0 | grad norm: 6.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.223 | TFLOPs: 42.13 | +[default7]: iteration 1839/ 6200 | consumed samples: 1883136 | consumed tokens: 3856662528 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.880093E+00 | loss scale: 1024.0 | grad norm: 5.959 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.327 | TFLOPs: 42.16 | +[default7]: iteration 1840/ 6200 | consumed samples: 1884160 | consumed tokens: 3858759680 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.862715E+00 | loss scale: 1024.0 | grad norm: 5.675 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.821 | TFLOPs: 42.01 | +[default7]: iteration 1841/ 6200 | consumed samples: 1885184 | consumed tokens: 3860856832 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.875620E+00 | loss scale: 1024.0 | grad norm: 6.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.525 | TFLOPs: 41.92 | +[default7]: iteration 1842/ 6200 | consumed samples: 1886208 | consumed tokens: 3862953984 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.853157E+00 | loss scale: 1024.0 | grad norm: 6.714 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.198 | TFLOPs: 41.82 | +[default7]: iteration 1843/ 6200 | consumed samples: 1887232 | consumed tokens: 3865051136 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.869411E+00 | loss scale: 1024.0 | grad norm: 7.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.198 | TFLOPs: 41.82 | +[default7]: iteration 1844/ 6200 | consumed samples: 1888256 | consumed tokens: 3867148288 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.890525E+00 | loss scale: 1024.0 | grad norm: 6.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.008 | TFLOPs: 41.76 | +[default7]: iteration 1845/ 6200 | consumed samples: 1889280 | consumed tokens: 3869245440 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.850518E+00 | loss scale: 1024.0 | grad norm: 6.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.268 | TFLOPs: 41.84 | +[default7]: iteration 1846/ 6200 | consumed samples: 1890304 | consumed tokens: 3871342592 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.864898E+00 | loss scale: 1024.0 | grad norm: 6.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.960 | TFLOPs: 41.75 | +[default7]: iteration 1847/ 6200 | consumed samples: 1891328 | consumed tokens: 3873439744 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.850129E+00 | loss scale: 1024.0 | grad norm: 5.779 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.559 | TFLOPs: 41.93 | +[default7]: iteration 1848/ 6200 | consumed samples: 1892352 | consumed tokens: 3875536896 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.876422E+00 | loss scale: 1024.0 | grad norm: 6.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.521 | TFLOPs: 41.92 | +[default7]: iteration 1849/ 6200 | consumed samples: 1893376 | consumed tokens: 3877634048 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.878996E+00 | loss scale: 1024.0 | grad norm: 6.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.399 | TFLOPs: 41.88 | +[default7]: iteration 1850/ 6200 | consumed samples: 1894400 | consumed tokens: 3879731200 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.859664E+00 | loss scale: 1024.0 | grad norm: 5.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.389 | TFLOPs: 41.88 | +[default7]: iteration 1851/ 6200 | consumed samples: 1895424 | consumed tokens: 3881828352 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.875832E+00 | loss scale: 1024.0 | grad norm: 5.657 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.689 | TFLOPs: 41.97 | +[default7]: iteration 1852/ 6200 | consumed samples: 1896448 | consumed tokens: 3883925504 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.829319E+00 | loss scale: 1024.0 | grad norm: 5.912 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.863 | TFLOPs: 41.72 | +[default7]: iteration 1853/ 6200 | consumed samples: 1897472 | consumed tokens: 3886022656 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839005E+00 | loss scale: 1024.0 | grad norm: 5.785 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.667 | TFLOPs: 41.96 | +[default7]: iteration 1854/ 6200 | consumed samples: 1898496 | consumed tokens: 3888119808 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.887502E+00 | loss scale: 1024.0 | grad norm: 5.970 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.473 | TFLOPs: 41.90 | +[default7]: iteration 1855/ 6200 | consumed samples: 1899520 | consumed tokens: 3890216960 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.874881E+00 | loss scale: 1024.0 | grad norm: 5.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.625 | TFLOPs: 41.95 | +[default7]: iteration 1856/ 6200 | consumed samples: 1900544 | consumed tokens: 3892314112 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.866116E+00 | loss scale: 1024.0 | grad norm: 6.773 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.595 | TFLOPs: 41.94 | +[default7]: iteration 1857/ 6200 | consumed samples: 1901568 | consumed tokens: 3894411264 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.864856E+00 | loss scale: 1024.0 | grad norm: 6.671 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.408 | TFLOPs: 42.19 | +[default7]: iteration 1858/ 6200 | consumed samples: 1902592 | consumed tokens: 3896508416 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.854657E+00 | loss scale: 1024.0 | grad norm: 8.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.523 | TFLOPs: 42.22 | +[default7]: iteration 1859/ 6200 | consumed samples: 1903616 | consumed tokens: 3898605568 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.873632E+00 | loss scale: 1024.0 | grad norm: 7.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.301 | TFLOPs: 42.16 | +[default7]: iteration 1860/ 6200 | consumed samples: 1904640 | consumed tokens: 3900702720 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884069E+00 | loss scale: 1024.0 | grad norm: 5.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.716 | TFLOPs: 42.28 | +[default7]: iteration 1861/ 6200 | consumed samples: 1905664 | consumed tokens: 3902799872 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.836463E+00 | loss scale: 1024.0 | grad norm: 7.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.612 | TFLOPs: 42.25 | +[default7]: iteration 1862/ 6200 | consumed samples: 1906688 | consumed tokens: 3904897024 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.850121E+00 | loss scale: 1024.0 | grad norm: 7.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.711 | TFLOPs: 42.28 | +[default7]: iteration 1863/ 6200 | consumed samples: 1907712 | consumed tokens: 3906994176 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.864937E+00 | loss scale: 1024.0 | grad norm: 5.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.671 | TFLOPs: 42.27 | +[default7]: iteration 1864/ 6200 | consumed samples: 1908736 | consumed tokens: 3909091328 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.876333E+00 | loss scale: 1024.0 | grad norm: 5.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.176 | TFLOPs: 42.12 | +[default7]: iteration 1865/ 6200 | consumed samples: 1909760 | consumed tokens: 3911188480 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.844403E+00 | loss scale: 1024.0 | grad norm: 5.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.522 | TFLOPs: 42.22 | +[default7]: iteration 1866/ 6200 | consumed samples: 1910784 | consumed tokens: 3913285632 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.871863E+00 | loss scale: 1024.0 | grad norm: 5.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.513 | TFLOPs: 42.22 | +[default7]: iteration 1867/ 6200 | consumed samples: 1911808 | consumed tokens: 3915382784 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.875853E+00 | loss scale: 1024.0 | grad norm: 6.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.399 | TFLOPs: 42.19 | +[default7]: iteration 1868/ 6200 | consumed samples: 1912832 | consumed tokens: 3917479936 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.906569E+00 | loss scale: 1024.0 | grad norm: 6.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.344 | TFLOPs: 42.17 | +[default7]: iteration 1869/ 6200 | consumed samples: 1913856 | consumed tokens: 3919577088 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.870271E+00 | loss scale: 1024.0 | grad norm: 5.064 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.295 | TFLOPs: 42.15 | +[default7]: iteration 1870/ 6200 | consumed samples: 1914880 | consumed tokens: 3921674240 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.861176E+00 | loss scale: 1024.0 | grad norm: 5.977 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.233 | TFLOPs: 42.13 | +[default7]: iteration 1871/ 6200 | consumed samples: 1915904 | consumed tokens: 3923771392 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.844732E+00 | loss scale: 1024.0 | grad norm: 7.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.332 | TFLOPs: 42.16 | +[default7]: iteration 1872/ 6200 | consumed samples: 1916928 | consumed tokens: 3925868544 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.883576E+00 | loss scale: 1024.0 | grad norm: 5.971 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.443 | TFLOPs: 41.89 | +[default7]: iteration 1873/ 6200 | consumed samples: 1917952 | consumed tokens: 3927965696 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.859213E+00 | loss scale: 1024.0 | grad norm: 5.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.329 | TFLOPs: 41.86 | +[default7]: iteration 1874/ 6200 | consumed samples: 1918976 | consumed tokens: 3930062848 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.889606E+00 | loss scale: 1024.0 | grad norm: 5.825 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.265 | TFLOPs: 41.84 | +[default7]: iteration 1875/ 6200 | consumed samples: 1920000 | consumed tokens: 3932160000 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.881671E+00 | loss scale: 1024.0 | grad norm: 5.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.557 | TFLOPs: 41.93 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 1875 | lm loss value: 3.479232E+00 | lm loss PPL: 3.243480E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 1875 | lm loss value: 1.771044E+00 | lm loss PPL: 5.876988E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 1876/ 6200 | consumed samples: 1921024 | consumed tokens: 3934257152 | elapsed time per iteration (s): 52.18 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.835962E+00 | loss scale: 1024.0 | grad norm: 6.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.625 | TFLOPs: 5.98 | +[default7]: iteration 1877/ 6200 | consumed samples: 1922048 | consumed tokens: 3936354304 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.877734E+00 | loss scale: 1024.0 | grad norm: 5.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.342 | TFLOPs: 41.86 | +[default7]: iteration 1878/ 6200 | consumed samples: 1923072 | consumed tokens: 3938451456 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.860529E+00 | loss scale: 1024.0 | grad norm: 5.068 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.961 | TFLOPs: 41.75 | +[default7]: iteration 1879/ 6200 | consumed samples: 1924096 | consumed tokens: 3940548608 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.860202E+00 | loss scale: 1024.0 | grad norm: 5.629 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.197 | TFLOPs: 42.12 | +[default7]: iteration 1880/ 6200 | consumed samples: 1925120 | consumed tokens: 3942645760 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.866311E+00 | loss scale: 1024.0 | grad norm: 5.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.661 | TFLOPs: 41.96 | +[default7]: iteration 1881/ 6200 | consumed samples: 1926144 | consumed tokens: 3944742912 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.886110E+00 | loss scale: 1024.0 | grad norm: 5.727 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.024 | TFLOPs: 42.07 | +[default7]: iteration 1882/ 6200 | consumed samples: 1927168 | consumed tokens: 3946840064 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.864193E+00 | loss scale: 1024.0 | grad norm: 6.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.528 | TFLOPs: 42.22 | +[default7]: iteration 1883/ 6200 | consumed samples: 1928192 | consumed tokens: 3948937216 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857838E+00 | loss scale: 1024.0 | grad norm: 5.950 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.495 | TFLOPs: 42.21 | +[default7]: iteration 1884/ 6200 | consumed samples: 1929216 | consumed tokens: 3951034368 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.886912E+00 | loss scale: 1024.0 | grad norm: 5.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.315 | TFLOPs: 42.16 | +[default7]: iteration 1885/ 6200 | consumed samples: 1930240 | consumed tokens: 3953131520 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.871372E+00 | loss scale: 1024.0 | grad norm: 5.932 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.528 | TFLOPs: 42.22 | +[default7]: iteration 1886/ 6200 | consumed samples: 1931264 | consumed tokens: 3955228672 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.848325E+00 | loss scale: 1024.0 | grad norm: 5.865 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.416 | TFLOPs: 42.19 | +[default7]: iteration 1887/ 6200 | consumed samples: 1932288 | consumed tokens: 3957325824 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.888613E+00 | loss scale: 1024.0 | grad norm: 5.523 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.189 | TFLOPs: 42.12 | +[default7]: iteration 1888/ 6200 | consumed samples: 1933312 | consumed tokens: 3959422976 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.858522E+00 | loss scale: 1024.0 | grad norm: 5.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.253 | TFLOPs: 42.14 | +[default7]: iteration 1889/ 6200 | consumed samples: 1934336 | consumed tokens: 3961520128 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.897267E+00 | loss scale: 1024.0 | grad norm: 6.825 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.340 | TFLOPs: 42.17 | +[default7]: iteration 1890/ 6200 | consumed samples: 1935360 | consumed tokens: 3963617280 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.849155E+00 | loss scale: 1024.0 | grad norm: 6.042 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.409 | TFLOPs: 42.19 | +[default7]: iteration 1891/ 6200 | consumed samples: 1936384 | consumed tokens: 3965714432 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.861182E+00 | loss scale: 1024.0 | grad norm: 6.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.354 | TFLOPs: 42.17 | +[default7]: iteration 1892/ 6200 | consumed samples: 1937408 | consumed tokens: 3967811584 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.882339E+00 | loss scale: 1024.0 | grad norm: 6.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.082 | TFLOPs: 42.09 | +[default7]: iteration 1893/ 6200 | consumed samples: 1938432 | consumed tokens: 3969908736 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.870227E+00 | loss scale: 1024.0 | grad norm: 5.698 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.240 | TFLOPs: 42.14 | +[default7]: iteration 1894/ 6200 | consumed samples: 1939456 | consumed tokens: 3972005888 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.864034E+00 | loss scale: 1024.0 | grad norm: 5.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.476 | TFLOPs: 42.21 | +[default7]: iteration 1895/ 6200 | consumed samples: 1940480 | consumed tokens: 3974103040 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.866873E+00 | loss scale: 1024.0 | grad norm: 6.006 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.559 | TFLOPs: 42.23 | +[default7]: iteration 1896/ 6200 | consumed samples: 1941504 | consumed tokens: 3976200192 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.909214E+00 | loss scale: 1024.0 | grad norm: 5.967 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.554 | TFLOPs: 42.23 | +[default7]: iteration 1897/ 6200 | consumed samples: 1942528 | consumed tokens: 3978297344 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.869699E+00 | loss scale: 1024.0 | grad norm: 5.664 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.295 | TFLOPs: 42.15 | +[default7]: iteration 1898/ 6200 | consumed samples: 1943552 | consumed tokens: 3980394496 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.879496E+00 | loss scale: 1024.0 | grad norm: 5.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.704 | TFLOPs: 42.28 | +[default7]: iteration 1899/ 6200 | consumed samples: 1944576 | consumed tokens: 3982491648 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884988E+00 | loss scale: 1024.0 | grad norm: 5.942 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.344 | TFLOPs: 42.17 | +[default7]: iteration 1900/ 6200 | consumed samples: 1945600 | consumed tokens: 3984588800 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.877533E+00 | loss scale: 1024.0 | grad norm: 5.663 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.534 | TFLOPs: 42.23 | +[default7]: iteration 1901/ 6200 | consumed samples: 1946624 | consumed tokens: 3986685952 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.870806E+00 | loss scale: 1024.0 | grad norm: 6.897 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.342 | TFLOPs: 42.17 | +[default7]: iteration 1902/ 6200 | consumed samples: 1947648 | consumed tokens: 3988783104 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.863915E+00 | loss scale: 1024.0 | grad norm: 5.847 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.401 | TFLOPs: 42.19 | +[default7]: iteration 1903/ 6200 | consumed samples: 1948672 | consumed tokens: 3990880256 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.851517E+00 | loss scale: 1024.0 | grad norm: 5.788 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.396 | TFLOPs: 42.18 | +[default7]: iteration 1904/ 6200 | consumed samples: 1949696 | consumed tokens: 3992977408 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.842880E+00 | loss scale: 1024.0 | grad norm: 5.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.085 | TFLOPs: 42.09 | +[default7]: iteration 1905/ 6200 | consumed samples: 1950720 | consumed tokens: 3995074560 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.848158E+00 | loss scale: 1024.0 | grad norm: 5.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.513 | TFLOPs: 42.22 | +[default7]: iteration 1906/ 6200 | consumed samples: 1951744 | consumed tokens: 3997171712 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.854683E+00 | loss scale: 1024.0 | grad norm: 5.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.296 | TFLOPs: 42.15 | +[default7]: iteration 1907/ 6200 | consumed samples: 1952768 | consumed tokens: 3999268864 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.840152E+00 | loss scale: 1024.0 | grad norm: 5.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.414 | TFLOPs: 42.19 | +[default7]: iteration 1908/ 6200 | consumed samples: 1953792 | consumed tokens: 4001366016 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.859367E+00 | loss scale: 1024.0 | grad norm: 5.843 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.646 | TFLOPs: 42.26 | +[default7]: iteration 1909/ 6200 | consumed samples: 1954816 | consumed tokens: 4003463168 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.890608E+00 | loss scale: 1024.0 | grad norm: 5.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.435 | TFLOPs: 42.20 | +[default7]: iteration 1910/ 6200 | consumed samples: 1955840 | consumed tokens: 4005560320 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.880297E+00 | loss scale: 1024.0 | grad norm: 7.725 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.465 | TFLOPs: 42.21 | +[default7]: iteration 1911/ 6200 | consumed samples: 1956864 | consumed tokens: 4007657472 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.858947E+00 | loss scale: 1024.0 | grad norm: 5.603 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.544 | TFLOPs: 42.23 | +[default7]: iteration 1912/ 6200 | consumed samples: 1957888 | consumed tokens: 4009754624 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.832862E+00 | loss scale: 1024.0 | grad norm: 5.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.654 | TFLOPs: 42.26 | +[default7]: iteration 1913/ 6200 | consumed samples: 1958912 | consumed tokens: 4011851776 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.852574E+00 | loss scale: 1024.0 | grad norm: 5.842 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.801 | TFLOPs: 42.31 | +[default7]: iteration 1914/ 6200 | consumed samples: 1959936 | consumed tokens: 4013948928 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.881449E+00 | loss scale: 1024.0 | grad norm: 5.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.748 | TFLOPs: 42.29 | +[default7]: iteration 1915/ 6200 | consumed samples: 1960960 | consumed tokens: 4016046080 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884231E+00 | loss scale: 1024.0 | grad norm: 7.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.308 | TFLOPs: 42.16 | +[default7]: iteration 1916/ 6200 | consumed samples: 1961984 | consumed tokens: 4018143232 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.880514E+00 | loss scale: 1024.0 | grad norm: 5.871 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.755 | TFLOPs: 42.29 | +[default7]: iteration 1917/ 6200 | consumed samples: 1963008 | consumed tokens: 4020240384 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.841924E+00 | loss scale: 1024.0 | grad norm: 6.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.231 | TFLOPs: 42.13 | +[default7]: iteration 1918/ 6200 | consumed samples: 1964032 | consumed tokens: 4022337536 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.868562E+00 | loss scale: 1024.0 | grad norm: 7.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.097 | TFLOPs: 42.09 | +[default7]: iteration 1919/ 6200 | consumed samples: 1965056 | consumed tokens: 4024434688 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.890961E+00 | loss scale: 1024.0 | grad norm: 5.813 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.572 | TFLOPs: 42.24 | +[default7]: iteration 1920/ 6200 | consumed samples: 1966080 | consumed tokens: 4026531840 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.871148E+00 | loss scale: 1024.0 | grad norm: 5.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.340 | TFLOPs: 42.17 | +[default7]: iteration 1921/ 6200 | consumed samples: 1967104 | consumed tokens: 4028628992 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.850283E+00 | loss scale: 1024.0 | grad norm: 6.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.574 | TFLOPs: 42.24 | +[default7]: iteration 1922/ 6200 | consumed samples: 1968128 | consumed tokens: 4030726144 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.860019E+00 | loss scale: 1024.0 | grad norm: 5.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.194 | TFLOPs: 42.12 | +[default7]: iteration 1923/ 6200 | consumed samples: 1969152 | consumed tokens: 4032823296 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.873042E+00 | loss scale: 1024.0 | grad norm: 7.702 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.231 | TFLOPs: 42.13 | +[default7]: iteration 1924/ 6200 | consumed samples: 1970176 | consumed tokens: 4034920448 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.854432E+00 | loss scale: 1024.0 | grad norm: 5.663 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.762 | TFLOPs: 41.99 | +[default7]: iteration 1925/ 6200 | consumed samples: 1971200 | consumed tokens: 4037017600 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.885629E+00 | loss scale: 1024.0 | grad norm: 5.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.173 | TFLOPs: 42.12 | +[default7]: iteration 1926/ 6200 | consumed samples: 1972224 | consumed tokens: 4039114752 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.833112E+00 | loss scale: 1024.0 | grad norm: 7.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.577 | TFLOPs: 42.24 | +[default7]: iteration 1927/ 6200 | consumed samples: 1973248 | consumed tokens: 4041211904 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.880811E+00 | loss scale: 1024.0 | grad norm: 7.732 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.642 | TFLOPs: 42.26 | +[default7]: iteration 1928/ 6200 | consumed samples: 1974272 | consumed tokens: 4043309056 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.890372E+00 | loss scale: 1024.0 | grad norm: 5.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.676 | TFLOPs: 42.27 | +[default7]: iteration 1929/ 6200 | consumed samples: 1975296 | consumed tokens: 4045406208 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.879639E+00 | loss scale: 1024.0 | grad norm: 5.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.457 | TFLOPs: 42.20 | +[default7]: iteration 1930/ 6200 | consumed samples: 1976320 | consumed tokens: 4047503360 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.863548E+00 | loss scale: 1024.0 | grad norm: 9.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.617 | TFLOPs: 42.25 | +[default7]: iteration 1931/ 6200 | consumed samples: 1977344 | consumed tokens: 4049600512 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.840765E+00 | loss scale: 1024.0 | grad norm: 8.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.659 | TFLOPs: 42.26 | +[default7]: iteration 1932/ 6200 | consumed samples: 1978368 | consumed tokens: 4051697664 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.847644E+00 | loss scale: 1024.0 | grad norm: 5.648 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.714 | TFLOPs: 42.28 | +[default7]: iteration 1933/ 6200 | consumed samples: 1979392 | consumed tokens: 4053794816 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.902638E+00 | loss scale: 1024.0 | grad norm: 5.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.661 | TFLOPs: 42.27 | +[default7]: iteration 1934/ 6200 | consumed samples: 1980416 | consumed tokens: 4055891968 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.886585E+00 | loss scale: 1024.0 | grad norm: 7.682 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.285 | TFLOPs: 42.15 | +[default7]: iteration 1935/ 6200 | consumed samples: 1981440 | consumed tokens: 4057989120 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.848179E+00 | loss scale: 1024.0 | grad norm: 5.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.175 | TFLOPs: 42.12 | +[default7]: iteration 1936/ 6200 | consumed samples: 1982464 | consumed tokens: 4060086272 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.858443E+00 | loss scale: 1024.0 | grad norm: 6.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.811 | TFLOPs: 42.31 | +[default7]: iteration 1937/ 6200 | consumed samples: 1983488 | consumed tokens: 4062183424 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.846483E+00 | loss scale: 1024.0 | grad norm: 5.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.905 | TFLOPs: 42.34 | +[default7]: iteration 1938/ 6200 | consumed samples: 1984512 | consumed tokens: 4064280576 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.856886E+00 | loss scale: 1024.0 | grad norm: 6.004 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.629 | TFLOPs: 42.26 | +[default7]: iteration 1939/ 6200 | consumed samples: 1985536 | consumed tokens: 4066377728 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.847040E+00 | loss scale: 1024.0 | grad norm: 5.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.637 | TFLOPs: 42.26 | +[default7]: iteration 1940/ 6200 | consumed samples: 1986560 | consumed tokens: 4068474880 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.866948E+00 | loss scale: 1024.0 | grad norm: 5.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.556 | TFLOPs: 42.23 | +[default7]: iteration 1941/ 6200 | consumed samples: 1987584 | consumed tokens: 4070572032 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.890900E+00 | loss scale: 1024.0 | grad norm: 7.075 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.454 | TFLOPs: 42.20 | +[default7]: iteration 1942/ 6200 | consumed samples: 1988608 | consumed tokens: 4072669184 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.882833E+00 | loss scale: 1024.0 | grad norm: 6.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.361 | TFLOPs: 42.17 | +[default7]: iteration 1943/ 6200 | consumed samples: 1989632 | consumed tokens: 4074766336 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857416E+00 | loss scale: 1024.0 | grad norm: 6.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.144 | TFLOPs: 42.11 | +[default7]: iteration 1944/ 6200 | consumed samples: 1990656 | consumed tokens: 4076863488 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.862533E+00 | loss scale: 1024.0 | grad norm: 6.028 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.542 | TFLOPs: 42.23 | +[default7]: iteration 1945/ 6200 | consumed samples: 1991680 | consumed tokens: 4078960640 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.862120E+00 | loss scale: 1024.0 | grad norm: 6.811 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.748 | TFLOPs: 42.29 | +[default7]: iteration 1946/ 6200 | consumed samples: 1992704 | consumed tokens: 4081057792 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.867316E+00 | loss scale: 1024.0 | grad norm: 5.822 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.306 | TFLOPs: 42.16 | +[default7]: iteration 1947/ 6200 | consumed samples: 1993728 | consumed tokens: 4083154944 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.828259E+00 | loss scale: 1024.0 | grad norm: 5.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.415 | TFLOPs: 41.89 | +[default7]: iteration 1948/ 6200 | consumed samples: 1994752 | consumed tokens: 4085252096 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.829090E+00 | loss scale: 1024.0 | grad norm: 5.973 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.574 | TFLOPs: 41.93 | +[default7]: iteration 1949/ 6200 | consumed samples: 1995776 | consumed tokens: 4087349248 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.880745E+00 | loss scale: 1024.0 | grad norm: 7.900 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.670 | TFLOPs: 41.96 | +[default7]: iteration 1950/ 6200 | consumed samples: 1996800 | consumed tokens: 4089446400 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.817352E+00 | loss scale: 1024.0 | grad norm: 6.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.731 | TFLOPs: 41.98 | +[default7]: iteration 1951/ 6200 | consumed samples: 1997824 | consumed tokens: 4091543552 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.856360E+00 | loss scale: 1024.0 | grad norm: 5.988 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.819 | TFLOPs: 42.01 | +[default7]: iteration 1952/ 6200 | consumed samples: 1998848 | consumed tokens: 4093640704 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.891742E+00 | loss scale: 1024.0 | grad norm: 9.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.770 | TFLOPs: 41.99 | +[default7]: iteration 1953/ 6200 | consumed samples: 1999872 | consumed tokens: 4095737856 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.897217E+00 | loss scale: 1024.0 | grad norm: 7.638 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.593 | TFLOPs: 41.94 | +[default7]: iteration 1954/ 6200 | consumed samples: 2000896 | consumed tokens: 4097835008 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857720E+00 | loss scale: 1024.0 | grad norm: 7.036 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.598 | TFLOPs: 41.94 | +[default7]: iteration 1955/ 6200 | consumed samples: 2001920 | consumed tokens: 4099932160 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.840320E+00 | loss scale: 1024.0 | grad norm: 5.743 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.088 | TFLOPs: 42.09 | +[default7]: iteration 1956/ 6200 | consumed samples: 2002944 | consumed tokens: 4102029312 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.881620E+00 | loss scale: 1024.0 | grad norm: 7.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.753 | TFLOPs: 41.99 | +[default7]: iteration 1957/ 6200 | consumed samples: 2003968 | consumed tokens: 4104126464 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.851033E+00 | loss scale: 1024.0 | grad norm: 7.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.239 | TFLOPs: 42.14 | +[default7]: iteration 1958/ 6200 | consumed samples: 2004992 | consumed tokens: 4106223616 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.858378E+00 | loss scale: 1024.0 | grad norm: 6.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.467 | TFLOPs: 42.21 | +[default7]: iteration 1959/ 6200 | consumed samples: 2006016 | consumed tokens: 4108320768 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.847237E+00 | loss scale: 1024.0 | grad norm: 6.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.341 | TFLOPs: 42.17 | +[default7]: iteration 1960/ 6200 | consumed samples: 2007040 | consumed tokens: 4110417920 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.855510E+00 | loss scale: 1024.0 | grad norm: 5.575 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.114 | TFLOPs: 42.10 | +[default7]: iteration 1961/ 6200 | consumed samples: 2008064 | consumed tokens: 4112515072 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.853652E+00 | loss scale: 1024.0 | grad norm: 5.914 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.456 | TFLOPs: 42.20 | +[default7]: iteration 1962/ 6200 | consumed samples: 2009088 | consumed tokens: 4114612224 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.859088E+00 | loss scale: 1024.0 | grad norm: 5.827 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.524 | TFLOPs: 42.22 | +[default7]: iteration 1963/ 6200 | consumed samples: 2010112 | consumed tokens: 4116709376 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.838443E+00 | loss scale: 1024.0 | grad norm: 5.818 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.301 | TFLOPs: 42.16 | +[default7]: iteration 1964/ 6200 | consumed samples: 2011136 | consumed tokens: 4118806528 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.867976E+00 | loss scale: 1024.0 | grad norm: 6.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.296 | TFLOPs: 42.15 | +[default7]: iteration 1965/ 6200 | consumed samples: 2012160 | consumed tokens: 4120903680 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.877912E+00 | loss scale: 1024.0 | grad norm: 6.875 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.266 | TFLOPs: 42.14 | +[default7]: iteration 1966/ 6200 | consumed samples: 2013184 | consumed tokens: 4123000832 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.858289E+00 | loss scale: 1024.0 | grad norm: 8.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.479 | TFLOPs: 42.21 | +[default7]: iteration 1967/ 6200 | consumed samples: 2014208 | consumed tokens: 4125097984 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.861542E+00 | loss scale: 1024.0 | grad norm: 8.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.691 | TFLOPs: 42.27 | +[default7]: iteration 1968/ 6200 | consumed samples: 2015232 | consumed tokens: 4127195136 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.858137E+00 | loss scale: 1024.0 | grad norm: 6.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.684 | TFLOPs: 42.27 | +[default7]: iteration 1969/ 6200 | consumed samples: 2016256 | consumed tokens: 4129292288 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.846801E+00 | loss scale: 1024.0 | grad norm: 6.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.772 | TFLOPs: 42.30 | +[default7]: iteration 1970/ 6200 | consumed samples: 2017280 | consumed tokens: 4131389440 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.870466E+00 | loss scale: 1024.0 | grad norm: 7.775 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.933 | TFLOPs: 42.35 | +[default7]: iteration 1971/ 6200 | consumed samples: 2018304 | consumed tokens: 4133486592 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.862342E+00 | loss scale: 1024.0 | grad norm: 8.006 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.607 | TFLOPs: 42.25 | +[default7]: iteration 1972/ 6200 | consumed samples: 2019328 | consumed tokens: 4135583744 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.835491E+00 | loss scale: 1024.0 | grad norm: 6.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.420 | TFLOPs: 42.19 | +[default7]: iteration 1973/ 6200 | consumed samples: 2020352 | consumed tokens: 4137680896 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.854745E+00 | loss scale: 1024.0 | grad norm: 6.689 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.805 | TFLOPs: 42.31 | +[default7]: iteration 1974/ 6200 | consumed samples: 2021376 | consumed tokens: 4139778048 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.850527E+00 | loss scale: 1024.0 | grad norm: 7.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.797 | TFLOPs: 42.31 | +[default7]: iteration 1975/ 6200 | consumed samples: 2022400 | consumed tokens: 4141875200 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.847921E+00 | loss scale: 1024.0 | grad norm: 7.734 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.550 | TFLOPs: 42.23 | +[default7]: iteration 1976/ 6200 | consumed samples: 2023424 | consumed tokens: 4143972352 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.850502E+00 | loss scale: 1024.0 | grad norm: 6.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.753 | TFLOPs: 42.29 | +[default7]: iteration 1977/ 6200 | consumed samples: 2024448 | consumed tokens: 4146069504 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.873695E+00 | loss scale: 1024.0 | grad norm: 6.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.485 | TFLOPs: 42.21 | +[default7]: iteration 1978/ 6200 | consumed samples: 2025472 | consumed tokens: 4148166656 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.850046E+00 | loss scale: 1024.0 | grad norm: 5.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.759 | TFLOPs: 41.99 | +[default7]: iteration 1979/ 6200 | consumed samples: 2026496 | consumed tokens: 4150263808 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.879351E+00 | loss scale: 1024.0 | grad norm: 5.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.413 | TFLOPs: 42.19 | +[default7]: iteration 1980/ 6200 | consumed samples: 2027520 | consumed tokens: 4152360960 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.853695E+00 | loss scale: 1024.0 | grad norm: 7.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.654 | TFLOPs: 41.96 | +[default7]: iteration 1981/ 6200 | consumed samples: 2028544 | consumed tokens: 4154458112 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.851348E+00 | loss scale: 1024.0 | grad norm: 6.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.827 | TFLOPs: 42.01 | +[default7]: iteration 1982/ 6200 | consumed samples: 2029568 | consumed tokens: 4156555264 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.863871E+00 | loss scale: 1024.0 | grad norm: 5.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.994 | TFLOPs: 42.06 | +[default7]: iteration 1983/ 6200 | consumed samples: 2030592 | consumed tokens: 4158652416 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.854505E+00 | loss scale: 1024.0 | grad norm: 5.916 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.119 | TFLOPs: 42.10 | +[default7]: iteration 1984/ 6200 | consumed samples: 2031616 | consumed tokens: 4160749568 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.826978E+00 | loss scale: 1024.0 | grad norm: 6.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.138 | TFLOPs: 42.11 | +[default7]: iteration 1985/ 6200 | consumed samples: 2032640 | consumed tokens: 4162846720 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.845955E+00 | loss scale: 1024.0 | grad norm: 5.854 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.071 | TFLOPs: 42.09 | +[default7]: iteration 1986/ 6200 | consumed samples: 2033664 | consumed tokens: 4164943872 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.851453E+00 | loss scale: 1024.0 | grad norm: 5.864 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.539 | TFLOPs: 42.23 | +[default7]: iteration 1987/ 6200 | consumed samples: 2034688 | consumed tokens: 4167041024 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857708E+00 | loss scale: 1024.0 | grad norm: 5.904 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.580 | TFLOPs: 42.24 | +[default7]: iteration 1988/ 6200 | consumed samples: 2035712 | consumed tokens: 4169138176 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.848211E+00 | loss scale: 1024.0 | grad norm: 5.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.463 | TFLOPs: 42.20 | +[default7]: iteration 1989/ 6200 | consumed samples: 2036736 | consumed tokens: 4171235328 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.848383E+00 | loss scale: 1024.0 | grad norm: 6.018 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.788 | TFLOPs: 42.30 | +[default7]: iteration 1990/ 6200 | consumed samples: 2037760 | consumed tokens: 4173332480 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.840460E+00 | loss scale: 1024.0 | grad norm: 6.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.121 | TFLOPs: 42.10 | +[default7]: iteration 1991/ 6200 | consumed samples: 2038784 | consumed tokens: 4175429632 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.842152E+00 | loss scale: 1024.0 | grad norm: 5.703 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.935 | TFLOPs: 42.04 | +[default7]: iteration 1992/ 6200 | consumed samples: 2039808 | consumed tokens: 4177526784 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.838600E+00 | loss scale: 1024.0 | grad norm: 5.902 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.003 | TFLOPs: 42.06 | +[default7]: iteration 1993/ 6200 | consumed samples: 2040832 | consumed tokens: 4179623936 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.845682E+00 | loss scale: 1024.0 | grad norm: 5.928 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.046 | TFLOPs: 42.08 | +[default7]: iteration 1994/ 6200 | consumed samples: 2041856 | consumed tokens: 4181721088 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.844906E+00 | loss scale: 1024.0 | grad norm: 5.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.357 | TFLOPs: 42.17 | +[default7]: iteration 1995/ 6200 | consumed samples: 2042880 | consumed tokens: 4183818240 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.858111E+00 | loss scale: 1024.0 | grad norm: 6.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.026 | TFLOPs: 42.07 | +[default7]: iteration 1996/ 6200 | consumed samples: 2043904 | consumed tokens: 4185915392 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.856309E+00 | loss scale: 1024.0 | grad norm: 6.850 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.253 | TFLOPs: 42.14 | +[default7]: iteration 1997/ 6200 | consumed samples: 2044928 | consumed tokens: 4188012544 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.872926E+00 | loss scale: 1024.0 | grad norm: 6.663 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.934 | TFLOPs: 42.04 | +[default7]: iteration 1998/ 6200 | consumed samples: 2045952 | consumed tokens: 4190109696 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.890196E+00 | loss scale: 1024.0 | grad norm: 6.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.451 | TFLOPs: 42.20 | +[default7]: iteration 1999/ 6200 | consumed samples: 2046976 | consumed tokens: 4192206848 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.854943E+00 | loss scale: 1024.0 | grad norm: 5.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.220 | TFLOPs: 42.13 | +[default0]:[2022-10-06 17:01:09,827] [INFO] [logging.py:68:log_dist] [Rank 0] step=2000, skipped=7, lr=[2e-05, 2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default7]: iteration 2000/ 6200 | consumed samples: 2048000 | consumed tokens: 4194304000 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.862989E+00 | loss scale: 1024.0 | grad norm: 5.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.008 | TFLOPs: 42.07 | +[default0]:steps: 2000 loss: 1.8630 iter time (s): 7.381 samples/sec: 138.733 +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 2000 | lm loss value: 3.501493E+00 | lm loss PPL: 3.316492E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 2000 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 17:01:54,533] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is begin to save! +[default0]:[2022-10-06 17:01:54,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_01-model_00-model_states.pt... +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 2000 | lm loss value: 1.747644E+00 | lm loss PPL: 5.741063E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:[2022-10-06 17:01:54,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:54,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,041] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,067] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,118] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,145] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,199] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,225] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,285] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,311] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,337] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,369] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,450] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,475] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,504] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,530] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_25-model_00-model_states.pt... +[default6]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 17:01:55,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 17:01:55,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 17:01:55,640] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/mp_rank_00_model_states.pt +[default0]:[2022-10-06 17:01:55,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 17:01:55,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 17:01:55,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 17:01:55,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 17:01:55,845] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 17:01:55,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 17:01:55,844] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 17:01:55,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 17:01:55,871] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 17:01:55,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 17:01:55,859] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 17:01:55,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 17:01:55,916] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 17:01:55,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 17:01:55,856] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 17:01:55,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 17:01:55,853] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 17:01:55,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 17:01:55,875] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 17:01:55,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 17:01:55,878] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 17:01:55,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 17:01:55,912] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 17:01:55,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 17:01:55,964] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 17:01:55,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 17:01:55,916] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 17:01:55,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 17:01:55,991] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 17:01:55,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 17:01:55,984] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 17:01:55,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 17:01:55,956] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 17:01:55,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 17:01:55,957] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 17:01:55,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 17:01:55,980] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 17:01:55,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 17:01:55,973] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 17:01:55,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 17:01:55,974] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 17:01:55,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 17:01:55,954] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 17:01:55,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 17:01:55,962] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 17:01:55,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 17:01:55,963] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 17:01:55,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 17:01:55,958] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 17:01:55,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 17:01:55,964] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 17:01:55,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 17:01:55,979] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 17:01:56,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 17:01:56,016] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default4]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default6]:[2022-10-06 17:01:56,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 17:01:56,022] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default7]:[2022-10-06 17:01:56,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 17:01:56,068] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default7]:time (ms) | save-checkpoint: 1558.82 +[default5]:[2022-10-06 17:01:56,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 17:01:56,091] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default5]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default6]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default5]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default2]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default0]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default1]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default2]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default7]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default4]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default3]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default1]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default7]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default4]:[2022-10-06 17:01:56,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 17:01:56,066] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default2]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default0]:[2022-10-06 17:01:56,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 17:01:56,069] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default3]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default4]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default1]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default1]:[2022-10-06 17:01:56,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 17:01:56,082] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2000/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default3]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default2]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default7]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default6]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default5]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default0]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default0]: successfully saved checkpoint at iteration 2000 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default3]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default0]:[2022-10-06 17:01:56,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +[default7]: iteration 2001/ 6200 | consumed samples: 2049024 | consumed tokens: 4196401152 | elapsed time per iteration (s): 53.69 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.881608E+00 | loss scale: 1024.0 | grad norm: 6.023 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.073 | TFLOPs: 5.81 | +[default7]: iteration 2002/ 6200 | consumed samples: 2050048 | consumed tokens: 4198498304 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.879254E+00 | loss scale: 1024.0 | grad norm: 6.042 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.099 | TFLOPs: 42.09 | +[default7]: iteration 2003/ 6200 | consumed samples: 2051072 | consumed tokens: 4200595456 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.855276E+00 | loss scale: 1024.0 | grad norm: 6.030 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.927 | TFLOPs: 42.04 | +[default7]: iteration 2004/ 6200 | consumed samples: 2052096 | consumed tokens: 4202692608 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.867416E+00 | loss scale: 1024.0 | grad norm: 5.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.895 | TFLOPs: 42.03 | +[default7]: iteration 2005/ 6200 | consumed samples: 2053120 | consumed tokens: 4204789760 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.859429E+00 | loss scale: 1024.0 | grad norm: 8.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.124 | TFLOPs: 42.10 | +[default7]: iteration 2006/ 6200 | consumed samples: 2054144 | consumed tokens: 4206886912 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.855753E+00 | loss scale: 1024.0 | grad norm: 6.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.973 | TFLOPs: 42.06 | +[default7]: iteration 2007/ 6200 | consumed samples: 2055168 | consumed tokens: 4208984064 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.865842E+00 | loss scale: 1024.0 | grad norm: 6.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.123 | TFLOPs: 42.10 | +[default7]: iteration 2008/ 6200 | consumed samples: 2056192 | consumed tokens: 4211081216 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857870E+00 | loss scale: 1024.0 | grad norm: 5.718 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.944 | TFLOPs: 42.05 | +[default7]: iteration 2009/ 6200 | consumed samples: 2057216 | consumed tokens: 4213178368 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.835428E+00 | loss scale: 1024.0 | grad norm: 4.867 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.569 | TFLOPs: 42.24 | +[default7]: iteration 2010/ 6200 | consumed samples: 2058240 | consumed tokens: 4215275520 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.887007E+00 | loss scale: 1024.0 | grad norm: 6.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.106 | TFLOPs: 42.10 | +[default7]: iteration 2011/ 6200 | consumed samples: 2059264 | consumed tokens: 4217372672 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.849667E+00 | loss scale: 1024.0 | grad norm: 6.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.760 | TFLOPs: 41.99 | +[default7]: iteration 2012/ 6200 | consumed samples: 2060288 | consumed tokens: 4219469824 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.856086E+00 | loss scale: 1024.0 | grad norm: 7.938 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.581 | TFLOPs: 41.94 | +[default7]: iteration 2013/ 6200 | consumed samples: 2061312 | consumed tokens: 4221566976 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.875997E+00 | loss scale: 1024.0 | grad norm: 5.863 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.848 | TFLOPs: 42.02 | +[default7]: iteration 2014/ 6200 | consumed samples: 2062336 | consumed tokens: 4223664128 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.837794E+00 | loss scale: 1024.0 | grad norm: 6.973 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.287 | TFLOPs: 41.85 | +[default7]: iteration 2015/ 6200 | consumed samples: 2063360 | consumed tokens: 4225761280 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.846912E+00 | loss scale: 1024.0 | grad norm: 7.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.454 | TFLOPs: 41.90 | +[default7]: iteration 2016/ 6200 | consumed samples: 2064384 | consumed tokens: 4227858432 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.843719E+00 | loss scale: 1024.0 | grad norm: 6.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.925 | TFLOPs: 42.04 | +[default7]: iteration 2017/ 6200 | consumed samples: 2065408 | consumed tokens: 4229955584 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.833901E+00 | loss scale: 1024.0 | grad norm: 6.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.705 | TFLOPs: 41.97 | +[default7]: iteration 2018/ 6200 | consumed samples: 2066432 | consumed tokens: 4232052736 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839484E+00 | loss scale: 1024.0 | grad norm: 9.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.336 | TFLOPs: 42.17 | +[default7]: iteration 2019/ 6200 | consumed samples: 2067456 | consumed tokens: 4234149888 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.865279E+00 | loss scale: 1024.0 | grad norm: 7.955 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.711 | TFLOPs: 42.28 | +[default7]: iteration 2020/ 6200 | consumed samples: 2068480 | consumed tokens: 4236247040 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.847981E+00 | loss scale: 1024.0 | grad norm: 5.693 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.596 | TFLOPs: 42.25 | +[default7]: iteration 2021/ 6200 | consumed samples: 2069504 | consumed tokens: 4238344192 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.861593E+00 | loss scale: 1024.0 | grad norm: 8.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.918 | TFLOPs: 42.34 | +[default7]: iteration 2022/ 6200 | consumed samples: 2070528 | consumed tokens: 4240441344 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.861093E+00 | loss scale: 1024.0 | grad norm: 7.915 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.829 | TFLOPs: 42.32 | +[default7]: iteration 2023/ 6200 | consumed samples: 2071552 | consumed tokens: 4242538496 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857777E+00 | loss scale: 1024.0 | grad norm: 6.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.489 | TFLOPs: 42.21 | +[default7]: iteration 2024/ 6200 | consumed samples: 2072576 | consumed tokens: 4244635648 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.841124E+00 | loss scale: 1024.0 | grad norm: 6.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.711 | TFLOPs: 42.28 | +[default7]: iteration 2025/ 6200 | consumed samples: 2073600 | consumed tokens: 4246732800 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.855856E+00 | loss scale: 1024.0 | grad norm: 6.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.862 | TFLOPs: 42.33 | +[default7]: iteration 2026/ 6200 | consumed samples: 2074624 | consumed tokens: 4248829952 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.852163E+00 | loss scale: 1024.0 | grad norm: 5.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.708 | TFLOPs: 42.28 | +[default7]: iteration 2027/ 6200 | consumed samples: 2075648 | consumed tokens: 4250927104 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.871083E+00 | loss scale: 1024.0 | grad norm: 5.714 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.950 | TFLOPs: 42.35 | +[default7]: iteration 2028/ 6200 | consumed samples: 2076672 | consumed tokens: 4253024256 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.840733E+00 | loss scale: 1024.0 | grad norm: 5.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.803 | TFLOPs: 42.31 | +[default7]: iteration 2029/ 6200 | consumed samples: 2077696 | consumed tokens: 4255121408 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.843479E+00 | loss scale: 1024.0 | grad norm: 6.003 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.684 | TFLOPs: 42.27 | +[default7]: iteration 2030/ 6200 | consumed samples: 2078720 | consumed tokens: 4257218560 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.836846E+00 | loss scale: 1024.0 | grad norm: 5.831 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.106 | TFLOPs: 42.40 | +[default7]: iteration 2031/ 6200 | consumed samples: 2079744 | consumed tokens: 4259315712 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.848376E+00 | loss scale: 1024.0 | grad norm: 5.522 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.261 | TFLOPs: 42.45 | +[default7]: iteration 2032/ 6200 | consumed samples: 2080768 | consumed tokens: 4261412864 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.855462E+00 | loss scale: 1024.0 | grad norm: 5.929 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.140 | TFLOPs: 42.41 | +[default7]: iteration 2033/ 6200 | consumed samples: 2081792 | consumed tokens: 4263510016 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.847963E+00 | loss scale: 1024.0 | grad norm: 5.889 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.384 | TFLOPs: 42.49 | +[default7]: iteration 2034/ 6200 | consumed samples: 2082816 | consumed tokens: 4265607168 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.842999E+00 | loss scale: 1024.0 | grad norm: 5.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.434 | TFLOPs: 42.20 | +[default7]: iteration 2035/ 6200 | consumed samples: 2083840 | consumed tokens: 4267704320 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.851361E+00 | loss scale: 1024.0 | grad norm: 5.833 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.332 | TFLOPs: 42.16 | +[default7]: iteration 2036/ 6200 | consumed samples: 2084864 | consumed tokens: 4269801472 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.853189E+00 | loss scale: 1024.0 | grad norm: 5.770 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.077 | TFLOPs: 42.39 | +[default7]: iteration 2037/ 6200 | consumed samples: 2085888 | consumed tokens: 4271898624 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.827766E+00 | loss scale: 1024.0 | grad norm: 6.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.843 | TFLOPs: 42.32 | +[default7]: iteration 2038/ 6200 | consumed samples: 2086912 | consumed tokens: 4273995776 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.865850E+00 | loss scale: 1024.0 | grad norm: 6.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.084 | TFLOPs: 42.39 | +[default7]: iteration 2039/ 6200 | consumed samples: 2087936 | consumed tokens: 4276092928 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.842978E+00 | loss scale: 1024.0 | grad norm: 7.047 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.007 | TFLOPs: 42.37 | +[default7]: iteration 2040/ 6200 | consumed samples: 2088960 | consumed tokens: 4278190080 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.877636E+00 | loss scale: 1024.0 | grad norm: 7.021 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.178 | TFLOPs: 42.42 | +[default7]: iteration 2041/ 6200 | consumed samples: 2089984 | consumed tokens: 4280287232 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.801769E+00 | loss scale: 1024.0 | grad norm: 7.941 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.859 | TFLOPs: 42.33 | +[default7]: iteration 2042/ 6200 | consumed samples: 2091008 | consumed tokens: 4282384384 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.846324E+00 | loss scale: 1024.0 | grad norm: 6.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.634 | TFLOPs: 42.26 | +[default7]: iteration 2043/ 6200 | consumed samples: 2092032 | consumed tokens: 4284481536 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.853064E+00 | loss scale: 1024.0 | grad norm: 6.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.655 | TFLOPs: 42.26 | +[default7]: iteration 2044/ 6200 | consumed samples: 2093056 | consumed tokens: 4286578688 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.852537E+00 | loss scale: 1024.0 | grad norm: 6.593 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.457 | TFLOPs: 42.20 | +[default7]: iteration 2045/ 6200 | consumed samples: 2094080 | consumed tokens: 4288675840 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.838694E+00 | loss scale: 1024.0 | grad norm: 6.913 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.609 | TFLOPs: 42.25 | +[default7]: iteration 2046/ 6200 | consumed samples: 2095104 | consumed tokens: 4290772992 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857454E+00 | loss scale: 1024.0 | grad norm: 6.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.718 | TFLOPs: 42.28 | +[default7]: iteration 2047/ 6200 | consumed samples: 2096128 | consumed tokens: 4292870144 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.824490E+00 | loss scale: 1024.0 | grad norm: 6.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.524 | TFLOPs: 42.22 | +[default7]: iteration 2048/ 6200 | consumed samples: 2097152 | consumed tokens: 4294967296 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.826191E+00 | loss scale: 1024.0 | grad norm: 6.826 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.602 | TFLOPs: 42.25 | +[default7]: iteration 2049/ 6200 | consumed samples: 2098176 | consumed tokens: 4297064448 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.834993E+00 | loss scale: 1024.0 | grad norm: 5.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.820 | TFLOPs: 42.31 | +[default7]: iteration 2050/ 6200 | consumed samples: 2099200 | consumed tokens: 4299161600 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.841015E+00 | loss scale: 1024.0 | grad norm: 6.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.380 | TFLOPs: 42.18 | +[default7]: iteration 2051/ 6200 | consumed samples: 2100224 | consumed tokens: 4301258752 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.831528E+00 | loss scale: 1024.0 | grad norm: 6.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.367 | TFLOPs: 42.18 | +[default7]: iteration 2052/ 6200 | consumed samples: 2101248 | consumed tokens: 4303355904 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.848258E+00 | loss scale: 1024.0 | grad norm: 7.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.406 | TFLOPs: 42.19 | +[default7]: iteration 2053/ 6200 | consumed samples: 2102272 | consumed tokens: 4305453056 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.821685E+00 | loss scale: 1024.0 | grad norm: 6.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.581 | TFLOPs: 42.24 | +[default7]: iteration 2054/ 6200 | consumed samples: 2103296 | consumed tokens: 4307550208 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.848280E+00 | loss scale: 1024.0 | grad norm: 5.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.430 | TFLOPs: 42.19 | +[default7]: iteration 2055/ 6200 | consumed samples: 2104320 | consumed tokens: 4309647360 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.862720E+00 | loss scale: 1024.0 | grad norm: 5.888 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.123 | TFLOPs: 42.41 | +[default7]: iteration 2056/ 6200 | consumed samples: 2105344 | consumed tokens: 4311744512 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.878028E+00 | loss scale: 1024.0 | grad norm: 6.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.736 | TFLOPs: 42.29 | +[default7]: iteration 2057/ 6200 | consumed samples: 2106368 | consumed tokens: 4313841664 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.823823E+00 | loss scale: 1024.0 | grad norm: 5.653 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.290 | TFLOPs: 42.15 | +[default7]: iteration 2058/ 6200 | consumed samples: 2107392 | consumed tokens: 4315938816 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.865573E+00 | loss scale: 1024.0 | grad norm: 5.819 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.638 | TFLOPs: 42.26 | +[default7]: iteration 2059/ 6200 | consumed samples: 2108416 | consumed tokens: 4318035968 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.854869E+00 | loss scale: 1024.0 | grad norm: 5.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.421 | TFLOPs: 42.19 | +[default7]: iteration 2060/ 6200 | consumed samples: 2109440 | consumed tokens: 4320133120 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.826812E+00 | loss scale: 1024.0 | grad norm: 6.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.418 | TFLOPs: 42.19 | +[default7]: iteration 2061/ 6200 | consumed samples: 2110464 | consumed tokens: 4322230272 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.883874E+00 | loss scale: 1024.0 | grad norm: 6.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.669 | TFLOPs: 42.27 | +[default7]: iteration 2062/ 6200 | consumed samples: 2111488 | consumed tokens: 4324327424 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.863928E+00 | loss scale: 1024.0 | grad norm: 6.012 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.810 | TFLOPs: 42.31 | +[default7]: iteration 2063/ 6200 | consumed samples: 2112512 | consumed tokens: 4326424576 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.845330E+00 | loss scale: 1024.0 | grad norm: 5.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.723 | TFLOPs: 42.28 | +[default7]: iteration 2064/ 6200 | consumed samples: 2113536 | consumed tokens: 4328521728 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839047E+00 | loss scale: 1024.0 | grad norm: 5.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.604 | TFLOPs: 42.25 | +[default7]: iteration 2065/ 6200 | consumed samples: 2114560 | consumed tokens: 4330618880 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.865240E+00 | loss scale: 1024.0 | grad norm: 6.784 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.017 | TFLOPs: 42.37 | +[default7]: iteration 2066/ 6200 | consumed samples: 2115584 | consumed tokens: 4332716032 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857385E+00 | loss scale: 1024.0 | grad norm: 6.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.113 | TFLOPs: 42.40 | +[default7]: iteration 2067/ 6200 | consumed samples: 2116608 | consumed tokens: 4334813184 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.837790E+00 | loss scale: 1024.0 | grad norm: 6.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.565 | TFLOPs: 42.24 | +[default7]: iteration 2068/ 6200 | consumed samples: 2117632 | consumed tokens: 4336910336 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.846948E+00 | loss scale: 1024.0 | grad norm: 6.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.955 | TFLOPs: 42.35 | +[default7]: iteration 2069/ 6200 | consumed samples: 2118656 | consumed tokens: 4339007488 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.841825E+00 | loss scale: 1024.0 | grad norm: 5.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.723 | TFLOPs: 42.28 | +[default7]: iteration 2070/ 6200 | consumed samples: 2119680 | consumed tokens: 4341104640 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.846796E+00 | loss scale: 1024.0 | grad norm: 6.012 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.616 | TFLOPs: 42.25 | +[default7]: iteration 2071/ 6200 | consumed samples: 2120704 | consumed tokens: 4343201792 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.847186E+00 | loss scale: 1024.0 | grad norm: 6.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.953 | TFLOPs: 42.35 | +[default7]: iteration 2072/ 6200 | consumed samples: 2121728 | consumed tokens: 4345298944 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.828442E+00 | loss scale: 1024.0 | grad norm: 7.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.123 | TFLOPs: 42.41 | +[default7]: iteration 2073/ 6200 | consumed samples: 2122752 | consumed tokens: 4347396096 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857121E+00 | loss scale: 1024.0 | grad norm: 6.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.652 | TFLOPs: 42.26 | +[default7]: iteration 2074/ 6200 | consumed samples: 2123776 | consumed tokens: 4349493248 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.859267E+00 | loss scale: 1024.0 | grad norm: 7.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.109 | TFLOPs: 42.40 | +[default7]: iteration 2075/ 6200 | consumed samples: 2124800 | consumed tokens: 4351590400 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857677E+00 | loss scale: 1024.0 | grad norm: 6.895 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.099 | TFLOPs: 42.40 | +[default7]: iteration 2076/ 6200 | consumed samples: 2125824 | consumed tokens: 4353687552 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.845567E+00 | loss scale: 1024.0 | grad norm: 6.846 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.135 | TFLOPs: 42.41 | +[default7]: iteration 2077/ 6200 | consumed samples: 2126848 | consumed tokens: 4355784704 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.851205E+00 | loss scale: 1024.0 | grad norm: 5.705 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.591 | TFLOPs: 42.24 | +[default7]: iteration 2078/ 6200 | consumed samples: 2127872 | consumed tokens: 4357881856 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.885021E+00 | loss scale: 1024.0 | grad norm: 6.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.136 | TFLOPs: 42.41 | +[default7]: iteration 2079/ 6200 | consumed samples: 2128896 | consumed tokens: 4359979008 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.828263E+00 | loss scale: 1024.0 | grad norm: 4.829 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.157 | TFLOPs: 42.42 | +[default7]: iteration 2080/ 6200 | consumed samples: 2129920 | consumed tokens: 4362076160 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.833791E+00 | loss scale: 1024.0 | grad norm: 5.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.780 | TFLOPs: 42.30 | +[default7]: iteration 2081/ 6200 | consumed samples: 2130944 | consumed tokens: 4364173312 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.843753E+00 | loss scale: 1024.0 | grad norm: 5.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.917 | TFLOPs: 42.34 | +[default7]: iteration 2082/ 6200 | consumed samples: 2131968 | consumed tokens: 4366270464 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.865993E+00 | loss scale: 1024.0 | grad norm: 6.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.832 | TFLOPs: 42.32 | +[default7]: iteration 2083/ 6200 | consumed samples: 2132992 | consumed tokens: 4368367616 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839413E+00 | loss scale: 1024.0 | grad norm: 7.905 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.503 | TFLOPs: 42.22 | +[default7]: iteration 2084/ 6200 | consumed samples: 2134016 | consumed tokens: 4370464768 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.894155E+00 | loss scale: 1024.0 | grad norm: 6.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.871 | TFLOPs: 42.33 | +[default7]: iteration 2085/ 6200 | consumed samples: 2135040 | consumed tokens: 4372561920 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.846212E+00 | loss scale: 1024.0 | grad norm: 7.954 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.088 | TFLOPs: 42.40 | +[default7]: iteration 2086/ 6200 | consumed samples: 2136064 | consumed tokens: 4374659072 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.852864E+00 | loss scale: 1024.0 | grad norm: 7.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.238 | TFLOPs: 42.44 | +[default7]: iteration 2087/ 6200 | consumed samples: 2137088 | consumed tokens: 4376756224 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.847249E+00 | loss scale: 1024.0 | grad norm: 5.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.039 | TFLOPs: 42.38 | +[default7]: iteration 2088/ 6200 | consumed samples: 2138112 | consumed tokens: 4378853376 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.854167E+00 | loss scale: 1024.0 | grad norm: 7.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.512 | TFLOPs: 42.52 | +[default7]: iteration 2089/ 6200 | consumed samples: 2139136 | consumed tokens: 4380950528 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.846841E+00 | loss scale: 1024.0 | grad norm: 6.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.713 | TFLOPs: 42.28 | +[default7]: iteration 2090/ 6200 | consumed samples: 2140160 | consumed tokens: 4383047680 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.884232E+00 | loss scale: 1024.0 | grad norm: 7.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.756 | TFLOPs: 42.29 | +[default7]: iteration 2091/ 6200 | consumed samples: 2141184 | consumed tokens: 4385144832 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.868460E+00 | loss scale: 1024.0 | grad norm: 6.019 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.554 | TFLOPs: 42.23 | +[default7]: iteration 2092/ 6200 | consumed samples: 2142208 | consumed tokens: 4387241984 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839711E+00 | loss scale: 1024.0 | grad norm: 6.784 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.336 | TFLOPs: 42.17 | +[default7]: iteration 2093/ 6200 | consumed samples: 2143232 | consumed tokens: 4389339136 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.818996E+00 | loss scale: 1024.0 | grad norm: 6.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.700 | TFLOPs: 42.28 | +[default7]: iteration 2094/ 6200 | consumed samples: 2144256 | consumed tokens: 4391436288 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.872766E+00 | loss scale: 1024.0 | grad norm: 5.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.723 | TFLOPs: 42.28 | +[default7]: iteration 2095/ 6200 | consumed samples: 2145280 | consumed tokens: 4393533440 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.842491E+00 | loss scale: 1024.0 | grad norm: 6.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.841 | TFLOPs: 42.32 | +[default7]: iteration 2096/ 6200 | consumed samples: 2146304 | consumed tokens: 4395630592 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.832824E+00 | loss scale: 1024.0 | grad norm: 8.705 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.244 | TFLOPs: 42.44 | +[default7]: iteration 2097/ 6200 | consumed samples: 2147328 | consumed tokens: 4397727744 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857124E+00 | loss scale: 1024.0 | grad norm: 6.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.927 | TFLOPs: 42.35 | +[default7]: iteration 2098/ 6200 | consumed samples: 2148352 | consumed tokens: 4399824896 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.855446E+00 | loss scale: 1024.0 | grad norm: 6.031 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.135 | TFLOPs: 42.41 | +[default7]: iteration 2099/ 6200 | consumed samples: 2149376 | consumed tokens: 4401922048 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.866505E+00 | loss scale: 1024.0 | grad norm: 6.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.812 | TFLOPs: 42.31 | +[default7]: iteration 2100/ 6200 | consumed samples: 2150400 | consumed tokens: 4404019200 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.848947E+00 | loss scale: 1024.0 | grad norm: 8.676 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.777 | TFLOPs: 42.30 | +[default7]: iteration 2101/ 6200 | consumed samples: 2151424 | consumed tokens: 4406116352 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.838657E+00 | loss scale: 1024.0 | grad norm: 6.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.813 | TFLOPs: 42.31 | +[default7]: iteration 2102/ 6200 | consumed samples: 2152448 | consumed tokens: 4408213504 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.866265E+00 | loss scale: 1024.0 | grad norm: 6.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.899 | TFLOPs: 42.34 | +[default7]: iteration 2103/ 6200 | consumed samples: 2153472 | consumed tokens: 4410310656 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.820786E+00 | loss scale: 1024.0 | grad norm: 5.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.926 | TFLOPs: 42.35 | +[default7]: iteration 2104/ 6200 | consumed samples: 2154496 | consumed tokens: 4412407808 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.849180E+00 | loss scale: 1024.0 | grad norm: 6.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.941 | TFLOPs: 42.35 | +[default7]: iteration 2105/ 6200 | consumed samples: 2155520 | consumed tokens: 4414504960 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.855469E+00 | loss scale: 1024.0 | grad norm: 6.030 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.487 | TFLOPs: 42.21 | +[default7]: iteration 2106/ 6200 | consumed samples: 2156544 | consumed tokens: 4416602112 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.837997E+00 | loss scale: 1024.0 | grad norm: 7.061 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.759 | TFLOPs: 41.99 | +[default7]: iteration 2107/ 6200 | consumed samples: 2157568 | consumed tokens: 4418699264 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.829554E+00 | loss scale: 1024.0 | grad norm: 5.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.333 | TFLOPs: 42.16 | +[default7]: iteration 2108/ 6200 | consumed samples: 2158592 | consumed tokens: 4420796416 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.825690E+00 | loss scale: 1024.0 | grad norm: 6.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.541 | TFLOPs: 42.23 | +[default7]: iteration 2109/ 6200 | consumed samples: 2159616 | consumed tokens: 4422893568 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.853426E+00 | loss scale: 1024.0 | grad norm: 8.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.967 | TFLOPs: 42.05 | +[default7]: iteration 2110/ 6200 | consumed samples: 2160640 | consumed tokens: 4424990720 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.870776E+00 | loss scale: 1024.0 | grad norm: 7.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.182 | TFLOPs: 42.12 | +[default7]: iteration 2111/ 6200 | consumed samples: 2161664 | consumed tokens: 4427087872 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.828496E+00 | loss scale: 1024.0 | grad norm: 6.011 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.452 | TFLOPs: 42.20 | +[default7]: iteration 2112/ 6200 | consumed samples: 2162688 | consumed tokens: 4429185024 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.858522E+00 | loss scale: 1024.0 | grad norm: 5.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.865 | TFLOPs: 42.33 | +[default7]: iteration 2113/ 6200 | consumed samples: 2163712 | consumed tokens: 4431282176 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.844854E+00 | loss scale: 1024.0 | grad norm: 10.027 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.036 | TFLOPs: 42.38 | +[default7]: iteration 2114/ 6200 | consumed samples: 2164736 | consumed tokens: 4433379328 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839218E+00 | loss scale: 1024.0 | grad norm: 7.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.809 | TFLOPs: 42.31 | +[default7]: iteration 2115/ 6200 | consumed samples: 2165760 | consumed tokens: 4435476480 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857691E+00 | loss scale: 1024.0 | grad norm: 7.814 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.892 | TFLOPs: 42.34 | +[default7]: iteration 2116/ 6200 | consumed samples: 2166784 | consumed tokens: 4437573632 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.841726E+00 | loss scale: 1024.0 | grad norm: 5.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.906 | TFLOPs: 42.34 | +[default7]: iteration 2117/ 6200 | consumed samples: 2167808 | consumed tokens: 4439670784 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857353E+00 | loss scale: 1024.0 | grad norm: 7.575 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.106 | TFLOPs: 42.40 | +[default7]: iteration 2118/ 6200 | consumed samples: 2168832 | consumed tokens: 4441767936 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.842830E+00 | loss scale: 1024.0 | grad norm: 5.716 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.153 | TFLOPs: 42.42 | +[default7]: iteration 2119/ 6200 | consumed samples: 2169856 | consumed tokens: 4443865088 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.846773E+00 | loss scale: 1024.0 | grad norm: 5.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.908 | TFLOPs: 42.34 | +[default7]: iteration 2120/ 6200 | consumed samples: 2170880 | consumed tokens: 4445962240 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.844016E+00 | loss scale: 1024.0 | grad norm: 6.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.790 | TFLOPs: 42.30 | +[default7]: iteration 2121/ 6200 | consumed samples: 2171904 | consumed tokens: 4448059392 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.838193E+00 | loss scale: 1024.0 | grad norm: 5.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.856 | TFLOPs: 42.32 | +[default7]: iteration 2122/ 6200 | consumed samples: 2172928 | consumed tokens: 4450156544 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.851373E+00 | loss scale: 1024.0 | grad norm: 6.926 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.426 | TFLOPs: 42.19 | +[default7]: iteration 2123/ 6200 | consumed samples: 2173952 | consumed tokens: 4452253696 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.836861E+00 | loss scale: 1024.0 | grad norm: 6.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.901 | TFLOPs: 42.34 | +[default7]: iteration 2124/ 6200 | consumed samples: 2174976 | consumed tokens: 4454350848 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.841451E+00 | loss scale: 1024.0 | grad norm: 5.601 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.507 | TFLOPs: 42.22 | +[default7]: iteration 2125/ 6200 | consumed samples: 2176000 | consumed tokens: 4456448000 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.851515E+00 | loss scale: 1024.0 | grad norm: 6.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.753 | TFLOPs: 42.29 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 2125 | lm loss value: 3.511801E+00 | lm loss PPL: 3.350858E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 2125 | lm loss value: 1.737339E+00 | lm loss PPL: 5.682205E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 2126/ 6200 | consumed samples: 2177024 | consumed tokens: 4458545152 | elapsed time per iteration (s): 51.84 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.827887E+00 | loss scale: 1024.0 | grad norm: 7.732 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.754 | TFLOPs: 6.02 | +[default7]: iteration 2127/ 6200 | consumed samples: 2178048 | consumed tokens: 4460642304 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.840102E+00 | loss scale: 1024.0 | grad norm: 6.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.783 | TFLOPs: 42.30 | +[default7]: iteration 2128/ 6200 | consumed samples: 2179072 | consumed tokens: 4462739456 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.821121E+00 | loss scale: 1024.0 | grad norm: 6.854 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.496 | TFLOPs: 42.21 | +[default7]: iteration 2129/ 6200 | consumed samples: 2180096 | consumed tokens: 4464836608 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.835943E+00 | loss scale: 1024.0 | grad norm: 5.680 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.674 | TFLOPs: 42.27 | +[default7]: iteration 2130/ 6200 | consumed samples: 2181120 | consumed tokens: 4466933760 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.833981E+00 | loss scale: 1024.0 | grad norm: 6.712 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.890 | TFLOPs: 42.33 | +[default7]: iteration 2131/ 6200 | consumed samples: 2182144 | consumed tokens: 4469030912 | elapsed time per iteration (s): 7.84 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.814218E+00 | loss scale: 1024.0 | grad norm: 7.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 130.531 | TFLOPs: 39.79 | +[default7]: iteration 2132/ 6200 | consumed samples: 2183168 | consumed tokens: 4471128064 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.861906E+00 | loss scale: 1024.0 | grad norm: 6.760 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.715 | TFLOPs: 42.28 | +[default7]: iteration 2133/ 6200 | consumed samples: 2184192 | consumed tokens: 4473225216 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.820752E+00 | loss scale: 1024.0 | grad norm: 7.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.186 | TFLOPs: 42.43 | +[default7]: iteration 2134/ 6200 | consumed samples: 2185216 | consumed tokens: 4475322368 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.862155E+00 | loss scale: 1024.0 | grad norm: 6.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.097 | TFLOPs: 42.40 | +[default7]: iteration 2135/ 6200 | consumed samples: 2186240 | consumed tokens: 4477419520 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.844784E+00 | loss scale: 1024.0 | grad norm: 6.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.016 | TFLOPs: 42.37 | +[default7]: iteration 2136/ 6200 | consumed samples: 2187264 | consumed tokens: 4479516672 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.854118E+00 | loss scale: 1024.0 | grad norm: 6.779 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.511 | TFLOPs: 42.22 | +[default7]: iteration 2137/ 6200 | consumed samples: 2188288 | consumed tokens: 4481613824 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.837888E+00 | loss scale: 1024.0 | grad norm: 6.048 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.396 | TFLOPs: 42.18 | +[default7]: iteration 2138/ 6200 | consumed samples: 2189312 | consumed tokens: 4483710976 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.850918E+00 | loss scale: 1024.0 | grad norm: 5.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.121 | TFLOPs: 42.10 | +[default7]: iteration 2139/ 6200 | consumed samples: 2190336 | consumed tokens: 4485808128 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.836301E+00 | loss scale: 1024.0 | grad norm: 7.047 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.978 | TFLOPs: 42.36 | +[default7]: iteration 2140/ 6200 | consumed samples: 2191360 | consumed tokens: 4487905280 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.841535E+00 | loss scale: 1024.0 | grad norm: 5.732 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.922 | TFLOPs: 42.34 | +[default7]: iteration 2141/ 6200 | consumed samples: 2192384 | consumed tokens: 4490002432 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.858860E+00 | loss scale: 1024.0 | grad norm: 5.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.883 | TFLOPs: 42.33 | +[default7]: iteration 2142/ 6200 | consumed samples: 2193408 | consumed tokens: 4492099584 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.877528E+00 | loss scale: 1024.0 | grad norm: 6.074 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.002 | TFLOPs: 42.37 | +[default7]: iteration 2143/ 6200 | consumed samples: 2194432 | consumed tokens: 4494196736 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.853354E+00 | loss scale: 1024.0 | grad norm: 5.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.767 | TFLOPs: 42.30 | +[default7]: iteration 2144/ 6200 | consumed samples: 2195456 | consumed tokens: 4496293888 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.819153E+00 | loss scale: 1024.0 | grad norm: 5.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.834 | TFLOPs: 42.32 | +[default7]: iteration 2145/ 6200 | consumed samples: 2196480 | consumed tokens: 4498391040 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.843497E+00 | loss scale: 1024.0 | grad norm: 5.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.750 | TFLOPs: 42.29 | +[default7]: iteration 2146/ 6200 | consumed samples: 2197504 | consumed tokens: 4500488192 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.818925E+00 | loss scale: 1024.0 | grad norm: 6.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.105 | TFLOPs: 42.40 | +[default7]: iteration 2147/ 6200 | consumed samples: 2198528 | consumed tokens: 4502585344 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.835714E+00 | loss scale: 1024.0 | grad norm: 6.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.549 | TFLOPs: 42.54 | +[default7]: iteration 2148/ 6200 | consumed samples: 2199552 | consumed tokens: 4504682496 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.861907E+00 | loss scale: 1024.0 | grad norm: 7.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.896 | TFLOPs: 42.34 | +[default7]: iteration 2149/ 6200 | consumed samples: 2200576 | consumed tokens: 4506779648 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.843348E+00 | loss scale: 1024.0 | grad norm: 5.985 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.389 | TFLOPs: 42.49 | +[default7]: iteration 2150/ 6200 | consumed samples: 2201600 | consumed tokens: 4508876800 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.825436E+00 | loss scale: 1024.0 | grad norm: 5.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.311 | TFLOPs: 42.46 | +[default7]: iteration 2151/ 6200 | consumed samples: 2202624 | consumed tokens: 4510973952 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.840755E+00 | loss scale: 1024.0 | grad norm: 6.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.320 | TFLOPs: 42.47 | +[default7]: iteration 2152/ 6200 | consumed samples: 2203648 | consumed tokens: 4513071104 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.832036E+00 | loss scale: 1024.0 | grad norm: 7.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.212 | TFLOPs: 42.43 | +[default7]: iteration 2153/ 6200 | consumed samples: 2204672 | consumed tokens: 4515168256 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839605E+00 | loss scale: 1024.0 | grad norm: 5.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.113 | TFLOPs: 42.40 | +[default7]: iteration 2154/ 6200 | consumed samples: 2205696 | consumed tokens: 4517265408 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.844075E+00 | loss scale: 1024.0 | grad norm: 5.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.636 | TFLOPs: 42.26 | +[default7]: iteration 2155/ 6200 | consumed samples: 2206720 | consumed tokens: 4519362560 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.844348E+00 | loss scale: 1024.0 | grad norm: 6.622 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.831 | TFLOPs: 42.32 | +[default7]: iteration 2156/ 6200 | consumed samples: 2207744 | consumed tokens: 4521459712 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.847589E+00 | loss scale: 1024.0 | grad norm: 5.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.417 | TFLOPs: 42.19 | +[default7]: iteration 2157/ 6200 | consumed samples: 2208768 | consumed tokens: 4523556864 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.837962E+00 | loss scale: 1024.0 | grad norm: 5.815 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.703 | TFLOPs: 42.28 | +[default7]: iteration 2158/ 6200 | consumed samples: 2209792 | consumed tokens: 4525654016 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.838668E+00 | loss scale: 1024.0 | grad norm: 5.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.799 | TFLOPs: 42.31 | +[default7]: iteration 2159/ 6200 | consumed samples: 2210816 | consumed tokens: 4527751168 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.818729E+00 | loss scale: 1024.0 | grad norm: 5.676 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.704 | TFLOPs: 42.28 | +[default7]: iteration 2160/ 6200 | consumed samples: 2211840 | consumed tokens: 4529848320 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.807139E+00 | loss scale: 1024.0 | grad norm: 6.814 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.564 | TFLOPs: 42.24 | +[default7]: iteration 2161/ 6200 | consumed samples: 2212864 | consumed tokens: 4531945472 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.856424E+00 | loss scale: 1024.0 | grad norm: 7.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.674 | TFLOPs: 42.27 | +[default7]: iteration 2162/ 6200 | consumed samples: 2213888 | consumed tokens: 4534042624 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.817733E+00 | loss scale: 1024.0 | grad norm: 5.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.634 | TFLOPs: 42.26 | +[default7]: iteration 2163/ 6200 | consumed samples: 2214912 | consumed tokens: 4536139776 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.859279E+00 | loss scale: 1024.0 | grad norm: 5.970 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.803 | TFLOPs: 42.31 | +[default7]: iteration 2164/ 6200 | consumed samples: 2215936 | consumed tokens: 4538236928 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.853760E+00 | loss scale: 1024.0 | grad norm: 7.819 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.812 | TFLOPs: 42.31 | +[default7]: iteration 2165/ 6200 | consumed samples: 2216960 | consumed tokens: 4540334080 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.831104E+00 | loss scale: 1024.0 | grad norm: 5.696 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.673 | TFLOPs: 42.27 | +[default7]: iteration 2166/ 6200 | consumed samples: 2217984 | consumed tokens: 4542431232 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.824665E+00 | loss scale: 1024.0 | grad norm: 5.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.871 | TFLOPs: 42.33 | +[default7]: iteration 2167/ 6200 | consumed samples: 2219008 | consumed tokens: 4544528384 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.838225E+00 | loss scale: 1024.0 | grad norm: 6.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.890 | TFLOPs: 42.33 | +[default7]: iteration 2168/ 6200 | consumed samples: 2220032 | consumed tokens: 4546625536 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.828914E+00 | loss scale: 1024.0 | grad norm: 5.732 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.407 | TFLOPs: 42.19 | +[default7]: iteration 2169/ 6200 | consumed samples: 2221056 | consumed tokens: 4548722688 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.856973E+00 | loss scale: 1024.0 | grad norm: 5.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.551 | TFLOPs: 42.23 | +[default7]: iteration 2170/ 6200 | consumed samples: 2222080 | consumed tokens: 4550819840 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.848916E+00 | loss scale: 1024.0 | grad norm: 5.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.933 | TFLOPs: 42.35 | +[default7]: iteration 2171/ 6200 | consumed samples: 2223104 | consumed tokens: 4552916992 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.834135E+00 | loss scale: 1024.0 | grad norm: 6.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.395 | TFLOPs: 42.18 | +[default7]: iteration 2172/ 6200 | consumed samples: 2224128 | consumed tokens: 4555014144 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.851347E+00 | loss scale: 1024.0 | grad norm: 5.743 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.599 | TFLOPs: 41.94 | +[default7]: iteration 2173/ 6200 | consumed samples: 2225152 | consumed tokens: 4557111296 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.851429E+00 | loss scale: 1024.0 | grad norm: 5.782 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.788 | TFLOPs: 42.30 | +[default7]: iteration 2174/ 6200 | consumed samples: 2226176 | consumed tokens: 4559208448 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.815236E+00 | loss scale: 1024.0 | grad norm: 6.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.514 | TFLOPs: 42.22 | +[default7]: iteration 2175/ 6200 | consumed samples: 2227200 | consumed tokens: 4561305600 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.851327E+00 | loss scale: 1024.0 | grad norm: 6.803 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.167 | TFLOPs: 42.11 | +[default7]: iteration 2176/ 6200 | consumed samples: 2228224 | consumed tokens: 4563402752 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.827615E+00 | loss scale: 1024.0 | grad norm: 6.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.942 | TFLOPs: 42.35 | +[default7]: iteration 2177/ 6200 | consumed samples: 2229248 | consumed tokens: 4565499904 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.845593E+00 | loss scale: 1024.0 | grad norm: 5.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.636 | TFLOPs: 42.26 | +[default7]: iteration 2178/ 6200 | consumed samples: 2230272 | consumed tokens: 4567597056 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.844209E+00 | loss scale: 1024.0 | grad norm: 7.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.491 | TFLOPs: 42.21 | +[default7]: iteration 2179/ 6200 | consumed samples: 2231296 | consumed tokens: 4569694208 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.852065E+00 | loss scale: 1024.0 | grad norm: 7.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.031 | TFLOPs: 42.38 | +[default7]: iteration 2180/ 6200 | consumed samples: 2232320 | consumed tokens: 4571791360 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.833093E+00 | loss scale: 1024.0 | grad norm: 7.965 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.634 | TFLOPs: 42.26 | +[default7]: iteration 2181/ 6200 | consumed samples: 2233344 | consumed tokens: 4573888512 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799665E+00 | loss scale: 1024.0 | grad norm: 6.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.048 | TFLOPs: 42.38 | +[default7]: iteration 2182/ 6200 | consumed samples: 2234368 | consumed tokens: 4575985664 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.845346E+00 | loss scale: 1024.0 | grad norm: 6.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.870 | TFLOPs: 42.33 | +[default7]: iteration 2183/ 6200 | consumed samples: 2235392 | consumed tokens: 4578082816 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.856383E+00 | loss scale: 1024.0 | grad norm: 8.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.914 | TFLOPs: 42.34 | +[default7]: iteration 2184/ 6200 | consumed samples: 2236416 | consumed tokens: 4580179968 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.847683E+00 | loss scale: 1024.0 | grad norm: 6.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.821 | TFLOPs: 42.31 | +[default7]: iteration 2185/ 6200 | consumed samples: 2237440 | consumed tokens: 4582277120 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.855996E+00 | loss scale: 1024.0 | grad norm: 6.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.868 | TFLOPs: 42.33 | +[default7]: iteration 2186/ 6200 | consumed samples: 2238464 | consumed tokens: 4584374272 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.821210E+00 | loss scale: 1024.0 | grad norm: 6.807 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.848 | TFLOPs: 42.32 | +[default7]: iteration 2187/ 6200 | consumed samples: 2239488 | consumed tokens: 4586471424 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.846842E+00 | loss scale: 1024.0 | grad norm: 5.830 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.784 | TFLOPs: 42.30 | +[default7]: iteration 2188/ 6200 | consumed samples: 2240512 | consumed tokens: 4588568576 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.829146E+00 | loss scale: 1024.0 | grad norm: 6.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.678 | TFLOPs: 42.27 | +[default7]: iteration 2189/ 6200 | consumed samples: 2241536 | consumed tokens: 4590665728 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.840710E+00 | loss scale: 1024.0 | grad norm: 6.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.754 | TFLOPs: 42.29 | +[default7]: iteration 2190/ 6200 | consumed samples: 2242560 | consumed tokens: 4592762880 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.836267E+00 | loss scale: 1024.0 | grad norm: 5.752 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.837 | TFLOPs: 42.32 | +[default7]: iteration 2191/ 6200 | consumed samples: 2243584 | consumed tokens: 4594860032 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.820094E+00 | loss scale: 1024.0 | grad norm: 5.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.907 | TFLOPs: 42.34 | +[default7]: iteration 2192/ 6200 | consumed samples: 2244608 | consumed tokens: 4596957184 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.842409E+00 | loss scale: 1024.0 | grad norm: 5.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.203 | TFLOPs: 42.13 | +[default7]: iteration 2193/ 6200 | consumed samples: 2245632 | consumed tokens: 4599054336 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.850246E+00 | loss scale: 1024.0 | grad norm: 7.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.266 | TFLOPs: 42.14 | +[default7]: iteration 2194/ 6200 | consumed samples: 2246656 | consumed tokens: 4601151488 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.825771E+00 | loss scale: 1024.0 | grad norm: 5.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.296 | TFLOPs: 42.15 | +[default7]: iteration 2195/ 6200 | consumed samples: 2247680 | consumed tokens: 4603248640 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.801833E+00 | loss scale: 1024.0 | grad norm: 7.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.554 | TFLOPs: 42.23 | +[default7]: iteration 2196/ 6200 | consumed samples: 2248704 | consumed tokens: 4605345792 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.826930E+00 | loss scale: 1024.0 | grad norm: 6.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.708 | TFLOPs: 42.28 | +[default7]: iteration 2197/ 6200 | consumed samples: 2249728 | consumed tokens: 4607442944 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.836455E+00 | loss scale: 1024.0 | grad norm: 5.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.347 | TFLOPs: 42.17 | +[default7]: iteration 2198/ 6200 | consumed samples: 2250752 | consumed tokens: 4609540096 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.812191E+00 | loss scale: 1024.0 | grad norm: 7.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.386 | TFLOPs: 42.18 | +[default7]: iteration 2199/ 6200 | consumed samples: 2251776 | consumed tokens: 4611637248 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.862919E+00 | loss scale: 1024.0 | grad norm: 6.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.683 | TFLOPs: 42.27 | +[default7]: iteration 2200/ 6200 | consumed samples: 2252800 | consumed tokens: 4613734400 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.821108E+00 | loss scale: 1024.0 | grad norm: 6.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.352 | TFLOPs: 42.17 | +[default7]: iteration 2201/ 6200 | consumed samples: 2253824 | consumed tokens: 4615831552 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.828679E+00 | loss scale: 1024.0 | grad norm: 9.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.364 | TFLOPs: 42.17 | +[default7]: iteration 2202/ 6200 | consumed samples: 2254848 | consumed tokens: 4617928704 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.859239E+00 | loss scale: 1024.0 | grad norm: 5.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.760 | TFLOPs: 42.30 | +[default7]: iteration 2203/ 6200 | consumed samples: 2255872 | consumed tokens: 4620025856 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808195E+00 | loss scale: 1024.0 | grad norm: 5.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.932 | TFLOPs: 42.35 | +[default7]: iteration 2204/ 6200 | consumed samples: 2256896 | consumed tokens: 4622123008 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839898E+00 | loss scale: 1024.0 | grad norm: 5.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.729 | TFLOPs: 41.98 | +[default7]: iteration 2205/ 6200 | consumed samples: 2257920 | consumed tokens: 4624220160 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.838462E+00 | loss scale: 1024.0 | grad norm: 6.603 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.898 | TFLOPs: 42.34 | +[default7]: iteration 2206/ 6200 | consumed samples: 2258944 | consumed tokens: 4626317312 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.826022E+00 | loss scale: 1024.0 | grad norm: 7.614 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.925 | TFLOPs: 42.35 | +[default7]: iteration 2207/ 6200 | consumed samples: 2259968 | consumed tokens: 4628414464 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.847892E+00 | loss scale: 1024.0 | grad norm: 6.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.909 | TFLOPs: 42.34 | +[default7]: iteration 2208/ 6200 | consumed samples: 2260992 | consumed tokens: 4630511616 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.845905E+00 | loss scale: 1024.0 | grad norm: 5.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.882 | TFLOPs: 42.33 | +[default7]: iteration 2209/ 6200 | consumed samples: 2262016 | consumed tokens: 4632608768 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.843881E+00 | loss scale: 1024.0 | grad norm: 6.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.833 | TFLOPs: 42.32 | +[default7]: iteration 2210/ 6200 | consumed samples: 2263040 | consumed tokens: 4634705920 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.820579E+00 | loss scale: 1024.0 | grad norm: 6.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.885 | TFLOPs: 42.33 | +[default7]: iteration 2211/ 6200 | consumed samples: 2264064 | consumed tokens: 4636803072 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.809795E+00 | loss scale: 1024.0 | grad norm: 6.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.714 | TFLOPs: 42.28 | +[default7]: iteration 2212/ 6200 | consumed samples: 2265088 | consumed tokens: 4638900224 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.867933E+00 | loss scale: 1024.0 | grad norm: 6.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.824 | TFLOPs: 42.31 | +[default7]: iteration 2213/ 6200 | consumed samples: 2266112 | consumed tokens: 4640997376 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.798831E+00 | loss scale: 1024.0 | grad norm: 5.603 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.557 | TFLOPs: 42.23 | +[default7]: iteration 2214/ 6200 | consumed samples: 2267136 | consumed tokens: 4643094528 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.831717E+00 | loss scale: 1024.0 | grad norm: 8.761 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.630 | TFLOPs: 42.26 | +[default7]: iteration 2215/ 6200 | consumed samples: 2268160 | consumed tokens: 4645191680 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.836575E+00 | loss scale: 1024.0 | grad norm: 7.938 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.767 | TFLOPs: 42.30 | +[default7]: iteration 2216/ 6200 | consumed samples: 2269184 | consumed tokens: 4647288832 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.829980E+00 | loss scale: 1024.0 | grad norm: 6.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.568 | TFLOPs: 42.24 | +[default7]: iteration 2217/ 6200 | consumed samples: 2270208 | consumed tokens: 4649385984 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.863915E+00 | loss scale: 1024.0 | grad norm: 9.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.920 | TFLOPs: 42.34 | +[default7]: iteration 2218/ 6200 | consumed samples: 2271232 | consumed tokens: 4651483136 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.801094E+00 | loss scale: 1024.0 | grad norm: 7.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.834 | TFLOPs: 42.32 | +[default7]: iteration 2219/ 6200 | consumed samples: 2272256 | consumed tokens: 4653580288 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.844104E+00 | loss scale: 1024.0 | grad norm: 6.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.822 | TFLOPs: 42.31 | +[default7]: iteration 2220/ 6200 | consumed samples: 2273280 | consumed tokens: 4655677440 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.836969E+00 | loss scale: 1024.0 | grad norm: 6.988 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.019 | TFLOPs: 42.37 | +[default7]: iteration 2221/ 6200 | consumed samples: 2274304 | consumed tokens: 4657774592 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.828378E+00 | loss scale: 1024.0 | grad norm: 6.988 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.901 | TFLOPs: 42.34 | +[default7]: iteration 2222/ 6200 | consumed samples: 2275328 | consumed tokens: 4659871744 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.838475E+00 | loss scale: 1024.0 | grad norm: 6.803 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.921 | TFLOPs: 42.34 | +[default7]: iteration 2223/ 6200 | consumed samples: 2276352 | consumed tokens: 4661968896 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.829157E+00 | loss scale: 1024.0 | grad norm: 6.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.018 | TFLOPs: 42.37 | +[default7]: iteration 2224/ 6200 | consumed samples: 2277376 | consumed tokens: 4664066048 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.821676E+00 | loss scale: 1024.0 | grad norm: 5.913 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.908 | TFLOPs: 42.34 | +[default7]: iteration 2225/ 6200 | consumed samples: 2278400 | consumed tokens: 4666163200 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.823236E+00 | loss scale: 1024.0 | grad norm: 8.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.360 | TFLOPs: 42.17 | +[default7]: iteration 2226/ 6200 | consumed samples: 2279424 | consumed tokens: 4668260352 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.827133E+00 | loss scale: 1024.0 | grad norm: 7.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.992 | TFLOPs: 42.37 | +[default7]: iteration 2227/ 6200 | consumed samples: 2280448 | consumed tokens: 4670357504 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.833807E+00 | loss scale: 1024.0 | grad norm: 5.860 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.958 | TFLOPs: 42.36 | +[default7]: iteration 2228/ 6200 | consumed samples: 2281472 | consumed tokens: 4672454656 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.811373E+00 | loss scale: 1024.0 | grad norm: 5.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.581 | TFLOPs: 42.24 | +[default7]: iteration 2229/ 6200 | consumed samples: 2282496 | consumed tokens: 4674551808 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.855709E+00 | loss scale: 1024.0 | grad norm: 5.649 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.960 | TFLOPs: 42.36 | +[default7]: iteration 2230/ 6200 | consumed samples: 2283520 | consumed tokens: 4676648960 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.796553E+00 | loss scale: 1024.0 | grad norm: 5.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.169 | TFLOPs: 42.42 | +[default7]: iteration 2231/ 6200 | consumed samples: 2284544 | consumed tokens: 4678746112 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.847770E+00 | loss scale: 1024.0 | grad norm: 5.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.046 | TFLOPs: 42.38 | +[default7]: iteration 2232/ 6200 | consumed samples: 2285568 | consumed tokens: 4680843264 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.816773E+00 | loss scale: 1024.0 | grad norm: 6.732 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.183 | TFLOPs: 42.42 | +[default7]: iteration 2233/ 6200 | consumed samples: 2286592 | consumed tokens: 4682940416 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.851207E+00 | loss scale: 1024.0 | grad norm: 6.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.093 | TFLOPs: 42.40 | +[default7]: iteration 2234/ 6200 | consumed samples: 2287616 | consumed tokens: 4685037568 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.831465E+00 | loss scale: 1024.0 | grad norm: 6.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.985 | TFLOPs: 42.36 | +[default7]: iteration 2235/ 6200 | consumed samples: 2288640 | consumed tokens: 4687134720 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.826734E+00 | loss scale: 1024.0 | grad norm: 6.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.981 | TFLOPs: 42.36 | +[default7]: iteration 2236/ 6200 | consumed samples: 2289664 | consumed tokens: 4689231872 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.818556E+00 | loss scale: 1024.0 | grad norm: 5.940 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.058 | TFLOPs: 42.39 | +[default7]: iteration 2237/ 6200 | consumed samples: 2290688 | consumed tokens: 4691329024 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.813484E+00 | loss scale: 1024.0 | grad norm: 5.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.960 | TFLOPs: 42.36 | +[default7]: iteration 2238/ 6200 | consumed samples: 2291712 | consumed tokens: 4693426176 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.837354E+00 | loss scale: 1024.0 | grad norm: 5.962 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.958 | TFLOPs: 42.36 | +[default7]: iteration 2239/ 6200 | consumed samples: 2292736 | consumed tokens: 4695523328 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.780734E+00 | loss scale: 1024.0 | grad norm: 6.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.704 | TFLOPs: 42.28 | +[default7]: iteration 2240/ 6200 | consumed samples: 2293760 | consumed tokens: 4697620480 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.807827E+00 | loss scale: 1024.0 | grad norm: 5.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.638 | TFLOPs: 42.26 | +[default7]: iteration 2241/ 6200 | consumed samples: 2294784 | consumed tokens: 4699717632 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.837626E+00 | loss scale: 1024.0 | grad norm: 5.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.826 | TFLOPs: 42.32 | +[default7]: iteration 2242/ 6200 | consumed samples: 2295808 | consumed tokens: 4701814784 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.824087E+00 | loss scale: 1024.0 | grad norm: 4.983 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.857 | TFLOPs: 42.32 | +[default7]: iteration 2243/ 6200 | consumed samples: 2296832 | consumed tokens: 4703911936 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.831319E+00 | loss scale: 1024.0 | grad norm: 6.027 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.698 | TFLOPs: 42.28 | +[default7]: iteration 2244/ 6200 | consumed samples: 2297856 | consumed tokens: 4706009088 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.841942E+00 | loss scale: 1024.0 | grad norm: 5.660 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.782 | TFLOPs: 42.30 | +[default7]: iteration 2245/ 6200 | consumed samples: 2298880 | consumed tokens: 4708106240 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.825712E+00 | loss scale: 1024.0 | grad norm: 5.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.792 | TFLOPs: 42.31 | +[default7]: iteration 2246/ 6200 | consumed samples: 2299904 | consumed tokens: 4710203392 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.840539E+00 | loss scale: 1024.0 | grad norm: 5.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.865 | TFLOPs: 42.33 | +[default7]: iteration 2247/ 6200 | consumed samples: 2300928 | consumed tokens: 4712300544 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.829067E+00 | loss scale: 1024.0 | grad norm: 5.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.061 | TFLOPs: 42.39 | +[default7]: iteration 2248/ 6200 | consumed samples: 2301952 | consumed tokens: 4714397696 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.837960E+00 | loss scale: 1024.0 | grad norm: 5.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.051 | TFLOPs: 42.38 | +[default7]: iteration 2249/ 6200 | consumed samples: 2302976 | consumed tokens: 4716494848 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.827132E+00 | loss scale: 1024.0 | grad norm: 5.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.947 | TFLOPs: 42.35 | +[default7]: iteration 2250/ 6200 | consumed samples: 2304000 | consumed tokens: 4718592000 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839255E+00 | loss scale: 1024.0 | grad norm: 5.937 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.084 | TFLOPs: 42.39 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 2250 | lm loss value: 3.513501E+00 | lm loss PPL: 3.356557E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 2250 | lm loss value: 1.729610E+00 | lm loss PPL: 5.638453E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 2250 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 17:34:10,816] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2250 is begin to save! +[default0]:[2022-10-06 17:34:10,821] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,223] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,252] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,283] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,311] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,340] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,367] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,427] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,455] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,483] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,511] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,570] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,600] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,715] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,742] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_24-model_00-model_states.pt... +[default7]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 17:34:11,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,854] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,881] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,909] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 17:34:11,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 17:34:11,910] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/mp_rank_00_model_states.pt +[default0]:[2022-10-06 17:34:11,910] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 17:34:11,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 17:34:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 17:34:12,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 17:34:12,123] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 17:34:12,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 17:34:12,113] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 17:34:12,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 17:34:12,125] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 17:34:12,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 17:34:12,185] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 17:34:12,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 17:34:12,175] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 17:34:12,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 17:34:12,195] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 17:34:12,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 17:34:12,143] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 17:34:12,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 17:34:12,194] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 17:34:12,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 17:34:12,177] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 17:34:12,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 17:34:12,200] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 17:34:12,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 17:34:12,174] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 17:34:12,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 17:34:12,202] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 17:34:12,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 17:34:12,220] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 17:34:12,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 17:34:12,180] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 17:34:12,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 17:34:12,264] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 17:34:12,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 17:34:12,258] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 17:34:12,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 17:34:12,286] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 17:34:12,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 17:34:12,263] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 17:34:12,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 17:34:12,219] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 17:34:12,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 17:34:12,239] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 17:34:12,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 17:34:12,288] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 17:34:12,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 17:34:12,249] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 17:34:12,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 17:34:12,238] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 17:34:12,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 17:34:12,250] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 17:34:12,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 17:34:12,317] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 17:34:12,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 17:34:12,246] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 17:34:12,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 17:34:12,290] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 17:34:12,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 17:34:12,288] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 17:34:12,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 17:34:12,325] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 17:34:12,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 17:34:12,358] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default3]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default6]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default2]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default1]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default7]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default0]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default0]: successfully saved checkpoint at iteration 2250 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default5]:[2022-10-06 17:34:12,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 17:34:12,430] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default4]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default6]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default3]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default1]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default2]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default0]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default2]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default4]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default5]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default6]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default4]:[2022-10-06 17:34:12,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 17:34:12,426] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2250/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default1]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default5]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default0]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default1]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default6]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default2]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default3]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default3]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default5]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default0]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default7]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default4]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default7]:[2022-10-06 17:34:12,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2250 is ready now! +[default7]:time (ms) | save-checkpoint: 1615.50 +[default7]: iteration 2251/ 6200 | consumed samples: 2305024 | consumed tokens: 4720689152 | elapsed time per iteration (s): 53.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.817767E+00 | loss scale: 1024.0 | grad norm: 5.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.199 | TFLOPs: 5.85 | +[default7]: iteration 2252/ 6200 | consumed samples: 2306048 | consumed tokens: 4722786304 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.849827E+00 | loss scale: 1024.0 | grad norm: 5.012 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.598 | TFLOPs: 42.25 | +[default7]: iteration 2253/ 6200 | consumed samples: 2307072 | consumed tokens: 4724883456 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.843369E+00 | loss scale: 1024.0 | grad norm: 5.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.582 | TFLOPs: 42.24 | +[default7]: iteration 2254/ 6200 | consumed samples: 2308096 | consumed tokens: 4726980608 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.814775E+00 | loss scale: 1024.0 | grad norm: 5.981 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.172 | TFLOPs: 42.42 | +[default7]: iteration 2255/ 6200 | consumed samples: 2309120 | consumed tokens: 4729077760 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.803055E+00 | loss scale: 1024.0 | grad norm: 5.880 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.838 | TFLOPs: 42.32 | +[default7]: iteration 2256/ 6200 | consumed samples: 2310144 | consumed tokens: 4731174912 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.846541E+00 | loss scale: 1024.0 | grad norm: 5.764 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.179 | TFLOPs: 42.42 | +[default7]: iteration 2257/ 6200 | consumed samples: 2311168 | consumed tokens: 4733272064 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.806485E+00 | loss scale: 1024.0 | grad norm: 6.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.881 | TFLOPs: 42.33 | +[default7]: iteration 2258/ 6200 | consumed samples: 2312192 | consumed tokens: 4735369216 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.831307E+00 | loss scale: 1024.0 | grad norm: 7.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.020 | TFLOPs: 42.37 | +[default7]: iteration 2259/ 6200 | consumed samples: 2313216 | consumed tokens: 4737466368 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.823318E+00 | loss scale: 1024.0 | grad norm: 5.970 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.263 | TFLOPs: 42.45 | +[default7]: iteration 2260/ 6200 | consumed samples: 2314240 | consumed tokens: 4739563520 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.812400E+00 | loss scale: 1024.0 | grad norm: 6.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.919 | TFLOPs: 42.34 | +[default7]: iteration 2261/ 6200 | consumed samples: 2315264 | consumed tokens: 4741660672 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857660E+00 | loss scale: 1024.0 | grad norm: 6.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.056 | TFLOPs: 42.39 | +[default7]: iteration 2262/ 6200 | consumed samples: 2316288 | consumed tokens: 4743757824 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.845518E+00 | loss scale: 1024.0 | grad norm: 6.043 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.777 | TFLOPs: 42.30 | +[default7]: iteration 2263/ 6200 | consumed samples: 2317312 | consumed tokens: 4745854976 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.835607E+00 | loss scale: 1024.0 | grad norm: 6.769 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.791 | TFLOPs: 42.30 | +[default7]: iteration 2264/ 6200 | consumed samples: 2318336 | consumed tokens: 4747952128 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839553E+00 | loss scale: 1024.0 | grad norm: 7.830 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.920 | TFLOPs: 42.34 | +[default7]: iteration 2265/ 6200 | consumed samples: 2319360 | consumed tokens: 4750049280 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.826252E+00 | loss scale: 1024.0 | grad norm: 5.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.988 | TFLOPs: 42.36 | +[default7]: iteration 2266/ 6200 | consumed samples: 2320384 | consumed tokens: 4752146432 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.815545E+00 | loss scale: 1024.0 | grad norm: 5.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.229 | TFLOPs: 42.44 | +[default7]: iteration 2267/ 6200 | consumed samples: 2321408 | consumed tokens: 4754243584 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.852409E+00 | loss scale: 1024.0 | grad norm: 6.875 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.070 | TFLOPs: 42.39 | +[default7]: iteration 2268/ 6200 | consumed samples: 2322432 | consumed tokens: 4756340736 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.834243E+00 | loss scale: 1024.0 | grad norm: 7.909 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.977 | TFLOPs: 42.36 | +[default7]: iteration 2269/ 6200 | consumed samples: 2323456 | consumed tokens: 4758437888 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.857979E+00 | loss scale: 1024.0 | grad norm: 5.997 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.995 | TFLOPs: 42.37 | +[default7]: iteration 2270/ 6200 | consumed samples: 2324480 | consumed tokens: 4760535040 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.840954E+00 | loss scale: 1024.0 | grad norm: 5.896 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.712 | TFLOPs: 42.28 | +[default7]: iteration 2271/ 6200 | consumed samples: 2325504 | consumed tokens: 4762632192 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.840795E+00 | loss scale: 1024.0 | grad norm: 8.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.600 | TFLOPs: 42.25 | +[default7]: iteration 2272/ 6200 | consumed samples: 2326528 | consumed tokens: 4764729344 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.854924E+00 | loss scale: 1024.0 | grad norm: 8.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.686 | TFLOPs: 42.27 | +[default7]: iteration 2273/ 6200 | consumed samples: 2327552 | consumed tokens: 4766826496 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.858636E+00 | loss scale: 1024.0 | grad norm: 8.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.470 | TFLOPs: 42.21 | +[default7]: iteration 2274/ 6200 | consumed samples: 2328576 | consumed tokens: 4768923648 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.842811E+00 | loss scale: 1024.0 | grad norm: 5.638 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.930 | TFLOPs: 42.35 | +[default7]: iteration 2275/ 6200 | consumed samples: 2329600 | consumed tokens: 4771020800 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.848464E+00 | loss scale: 1024.0 | grad norm: 8.071 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.638 | TFLOPs: 42.26 | +[default7]: iteration 2276/ 6200 | consumed samples: 2330624 | consumed tokens: 4773117952 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.817124E+00 | loss scale: 1024.0 | grad norm: 6.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.911 | TFLOPs: 42.04 | +[default7]: iteration 2277/ 6200 | consumed samples: 2331648 | consumed tokens: 4775215104 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.826916E+00 | loss scale: 1024.0 | grad norm: 7.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.525 | TFLOPs: 42.22 | +[default7]: iteration 2278/ 6200 | consumed samples: 2332672 | consumed tokens: 4777312256 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.830392E+00 | loss scale: 1024.0 | grad norm: 6.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.719 | TFLOPs: 42.28 | +[default7]: iteration 2279/ 6200 | consumed samples: 2333696 | consumed tokens: 4779409408 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.833430E+00 | loss scale: 1024.0 | grad norm: 6.999 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.250 | TFLOPs: 42.14 | +[default7]: iteration 2280/ 6200 | consumed samples: 2334720 | consumed tokens: 4781506560 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.832762E+00 | loss scale: 1024.0 | grad norm: 6.814 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.625 | TFLOPs: 42.25 | +[default7]: iteration 2281/ 6200 | consumed samples: 2335744 | consumed tokens: 4783603712 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.825470E+00 | loss scale: 1024.0 | grad norm: 6.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.487 | TFLOPs: 42.21 | +[default7]: iteration 2282/ 6200 | consumed samples: 2336768 | consumed tokens: 4785700864 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.822957E+00 | loss scale: 1024.0 | grad norm: 6.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.393 | TFLOPs: 42.18 | +[default7]: iteration 2283/ 6200 | consumed samples: 2337792 | consumed tokens: 4787798016 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.830866E+00 | loss scale: 1024.0 | grad norm: 6.917 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.468 | TFLOPs: 42.21 | +[default7]: iteration 2284/ 6200 | consumed samples: 2338816 | consumed tokens: 4789895168 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.872597E+00 | loss scale: 1024.0 | grad norm: 5.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.500 | TFLOPs: 42.22 | +[default7]: iteration 2285/ 6200 | consumed samples: 2339840 | consumed tokens: 4791992320 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.823899E+00 | loss scale: 1024.0 | grad norm: 6.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.459 | TFLOPs: 42.20 | +[default7]: iteration 2286/ 6200 | consumed samples: 2340864 | consumed tokens: 4794089472 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.846530E+00 | loss scale: 1024.0 | grad norm: 6.063 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.921 | TFLOPs: 42.34 | +[default7]: iteration 2287/ 6200 | consumed samples: 2341888 | consumed tokens: 4796186624 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.832288E+00 | loss scale: 1024.0 | grad norm: 6.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.086 | TFLOPs: 42.39 | +[default7]: iteration 2288/ 6200 | consumed samples: 2342912 | consumed tokens: 4798283776 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.829306E+00 | loss scale: 1024.0 | grad norm: 6.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.201 | TFLOPs: 42.12 | +[default7]: iteration 2289/ 6200 | consumed samples: 2343936 | consumed tokens: 4800380928 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795725E+00 | loss scale: 1024.0 | grad norm: 5.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.879 | TFLOPs: 42.33 | +[default7]: iteration 2290/ 6200 | consumed samples: 2344960 | consumed tokens: 4802478080 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.843922E+00 | loss scale: 1024.0 | grad norm: 6.798 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.131 | TFLOPs: 42.41 | +[default7]: iteration 2291/ 6200 | consumed samples: 2345984 | consumed tokens: 4804575232 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839035E+00 | loss scale: 1024.0 | grad norm: 6.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.985 | TFLOPs: 42.36 | +[default7]: iteration 2292/ 6200 | consumed samples: 2347008 | consumed tokens: 4806672384 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.804347E+00 | loss scale: 2048.0 | grad norm: 3.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.469 | TFLOPs: 42.21 | +[default7]: iteration 2293/ 6200 | consumed samples: 2348032 | consumed tokens: 4808769536 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.781179E+00 | loss scale: 2048.0 | grad norm: 5.979 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.455 | TFLOPs: 42.20 | +[default7]: iteration 2294/ 6200 | consumed samples: 2349056 | consumed tokens: 4810866688 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839457E+00 | loss scale: 2048.0 | grad norm: 5.594 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.964 | TFLOPs: 42.36 | +[default7]: iteration 2295/ 6200 | consumed samples: 2350080 | consumed tokens: 4812963840 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839462E+00 | loss scale: 2048.0 | grad norm: 5.936 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.763 | TFLOPs: 42.30 | +[default7]: iteration 2296/ 6200 | consumed samples: 2351104 | consumed tokens: 4815060992 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.810812E+00 | loss scale: 2048.0 | grad norm: 6.688 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.673 | TFLOPs: 42.27 | +[default7]: iteration 2297/ 6200 | consumed samples: 2352128 | consumed tokens: 4817158144 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.827492E+00 | loss scale: 2048.0 | grad norm: 5.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.896 | TFLOPs: 42.34 | +[default7]: iteration 2298/ 6200 | consumed samples: 2353152 | consumed tokens: 4819255296 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.835995E+00 | loss scale: 2048.0 | grad norm: 7.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.035 | TFLOPs: 42.38 | +[default7]: iteration 2299/ 6200 | consumed samples: 2354176 | consumed tokens: 4821352448 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.806675E+00 | loss scale: 2048.0 | grad norm: 5.757 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.888 | TFLOPs: 42.33 | +[default7]: iteration 2300/ 6200 | consumed samples: 2355200 | consumed tokens: 4823449600 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.810767E+00 | loss scale: 2048.0 | grad norm: 6.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.938 | TFLOPs: 42.35 | +[default7]: iteration 2301/ 6200 | consumed samples: 2356224 | consumed tokens: 4825546752 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.821667E+00 | loss scale: 2048.0 | grad norm: 6.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.063 | TFLOPs: 42.39 | +[default7]: iteration 2302/ 6200 | consumed samples: 2357248 | consumed tokens: 4827643904 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.823438E+00 | loss scale: 2048.0 | grad norm: 7.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.452 | TFLOPs: 42.20 | +[default7]: iteration 2303/ 6200 | consumed samples: 2358272 | consumed tokens: 4829741056 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.812077E+00 | loss scale: 2048.0 | grad norm: 6.710 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.209 | TFLOPs: 42.43 | +[default7]: iteration 2304/ 6200 | consumed samples: 2359296 | consumed tokens: 4831838208 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808297E+00 | loss scale: 2048.0 | grad norm: 4.994 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.210 | TFLOPs: 42.43 | +[default7]: iteration 2305/ 6200 | consumed samples: 2360320 | consumed tokens: 4833935360 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793081E+00 | loss scale: 2048.0 | grad norm: 5.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.943 | TFLOPs: 42.35 | +[default7]: iteration 2306/ 6200 | consumed samples: 2361344 | consumed tokens: 4836032512 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.837324E+00 | loss scale: 2048.0 | grad norm: 5.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.639 | TFLOPs: 42.26 | +[default7]: iteration 2307/ 6200 | consumed samples: 2362368 | consumed tokens: 4838129664 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.800176E+00 | loss scale: 2048.0 | grad norm: 5.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.905 | TFLOPs: 42.34 | +[default7]: iteration 2308/ 6200 | consumed samples: 2363392 | consumed tokens: 4840226816 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.833483E+00 | loss scale: 2048.0 | grad norm: 5.046 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.034 | TFLOPs: 42.38 | +[default7]: iteration 2309/ 6200 | consumed samples: 2364416 | consumed tokens: 4842323968 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.834346E+00 | loss scale: 2048.0 | grad norm: 5.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.829 | TFLOPs: 42.32 | +[default7]: iteration 2310/ 6200 | consumed samples: 2365440 | consumed tokens: 4844421120 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.797612E+00 | loss scale: 2048.0 | grad norm: 5.872 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.642 | TFLOPs: 42.26 | +[default7]: iteration 2311/ 6200 | consumed samples: 2366464 | consumed tokens: 4846518272 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808495E+00 | loss scale: 2048.0 | grad norm: 6.832 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.836 | TFLOPs: 42.32 | +[default7]: iteration 2312/ 6200 | consumed samples: 2367488 | consumed tokens: 4848615424 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.790203E+00 | loss scale: 2048.0 | grad norm: 6.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.675 | TFLOPs: 42.27 | +[default7]: iteration 2313/ 6200 | consumed samples: 2368512 | consumed tokens: 4850712576 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.841649E+00 | loss scale: 2048.0 | grad norm: 7.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.864 | TFLOPs: 42.33 | +[default7]: iteration 2314/ 6200 | consumed samples: 2369536 | consumed tokens: 4852809728 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.816226E+00 | loss scale: 2048.0 | grad norm: 5.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.790 | TFLOPs: 42.30 | +[default7]: iteration 2315/ 6200 | consumed samples: 2370560 | consumed tokens: 4854906880 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.820286E+00 | loss scale: 2048.0 | grad norm: 6.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.710 | TFLOPs: 42.28 | +[default7]: iteration 2316/ 6200 | consumed samples: 2371584 | consumed tokens: 4857004032 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.823466E+00 | loss scale: 2048.0 | grad norm: 5.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.956 | TFLOPs: 42.35 | +[default7]: iteration 2317/ 6200 | consumed samples: 2372608 | consumed tokens: 4859101184 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799149E+00 | loss scale: 2048.0 | grad norm: 5.565 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.012 | TFLOPs: 42.37 | +[default7]: iteration 2318/ 6200 | consumed samples: 2373632 | consumed tokens: 4861198336 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.803330E+00 | loss scale: 2048.0 | grad norm: 6.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.096 | TFLOPs: 42.40 | +[default7]: iteration 2319/ 6200 | consumed samples: 2374656 | consumed tokens: 4863295488 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.819308E+00 | loss scale: 2048.0 | grad norm: 5.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.666 | TFLOPs: 42.27 | +[default7]: iteration 2320/ 6200 | consumed samples: 2375680 | consumed tokens: 4865392640 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.805722E+00 | loss scale: 2048.0 | grad norm: 6.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.764 | TFLOPs: 42.30 | +[default7]: iteration 2321/ 6200 | consumed samples: 2376704 | consumed tokens: 4867489792 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.788229E+00 | loss scale: 2048.0 | grad norm: 6.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.023 | TFLOPs: 42.38 | +[default7]: iteration 2322/ 6200 | consumed samples: 2377728 | consumed tokens: 4869586944 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.814618E+00 | loss scale: 2048.0 | grad norm: 5.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.599 | TFLOPs: 42.25 | +[default7]: iteration 2323/ 6200 | consumed samples: 2378752 | consumed tokens: 4871684096 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.828492E+00 | loss scale: 2048.0 | grad norm: 4.848 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.961 | TFLOPs: 42.36 | +[default7]: iteration 2324/ 6200 | consumed samples: 2379776 | consumed tokens: 4873781248 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.805558E+00 | loss scale: 2048.0 | grad norm: 5.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.688 | TFLOPs: 42.27 | +[default7]: iteration 2325/ 6200 | consumed samples: 2380800 | consumed tokens: 4875878400 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.841851E+00 | loss scale: 2048.0 | grad norm: 5.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.033 | TFLOPs: 42.38 | +[default7]: iteration 2326/ 6200 | consumed samples: 2381824 | consumed tokens: 4877975552 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793317E+00 | loss scale: 2048.0 | grad norm: 6.969 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.599 | TFLOPs: 42.25 | +[default7]: iteration 2327/ 6200 | consumed samples: 2382848 | consumed tokens: 4880072704 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.807101E+00 | loss scale: 2048.0 | grad norm: 6.969 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.711 | TFLOPs: 42.28 | +[default7]: iteration 2328/ 6200 | consumed samples: 2383872 | consumed tokens: 4882169856 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.822983E+00 | loss scale: 2048.0 | grad norm: 5.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.801 | TFLOPs: 42.31 | +[default7]: iteration 2329/ 6200 | consumed samples: 2384896 | consumed tokens: 4884267008 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808542E+00 | loss scale: 2048.0 | grad norm: 5.764 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.082 | TFLOPs: 42.39 | +[default7]: iteration 2330/ 6200 | consumed samples: 2385920 | consumed tokens: 4886364160 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.807903E+00 | loss scale: 2048.0 | grad norm: 6.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.332 | TFLOPs: 42.16 | +[default7]: iteration 2331/ 6200 | consumed samples: 2386944 | consumed tokens: 4888461312 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.809023E+00 | loss scale: 2048.0 | grad norm: 5.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.743 | TFLOPs: 42.29 | +[default7]: iteration 2332/ 6200 | consumed samples: 2387968 | consumed tokens: 4890558464 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.832634E+00 | loss scale: 2048.0 | grad norm: 7.793 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.778 | TFLOPs: 42.30 | +[default7]: iteration 2333/ 6200 | consumed samples: 2388992 | consumed tokens: 4892655616 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.823218E+00 | loss scale: 2048.0 | grad norm: 5.951 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.684 | TFLOPs: 42.27 | +[default7]: iteration 2334/ 6200 | consumed samples: 2390016 | consumed tokens: 4894752768 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.811278E+00 | loss scale: 2048.0 | grad norm: 5.715 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.780 | TFLOPs: 42.30 | +[default7]: iteration 2335/ 6200 | consumed samples: 2391040 | consumed tokens: 4896849920 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.815726E+00 | loss scale: 2048.0 | grad norm: 7.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.903 | TFLOPs: 42.34 | +[default7]: iteration 2336/ 6200 | consumed samples: 2392064 | consumed tokens: 4898947072 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.835199E+00 | loss scale: 2048.0 | grad norm: 5.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.906 | TFLOPs: 42.34 | +[default7]: iteration 2337/ 6200 | consumed samples: 2393088 | consumed tokens: 4901044224 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.827506E+00 | loss scale: 2048.0 | grad norm: 6.904 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.495 | TFLOPs: 42.21 | +[default7]: iteration 2338/ 6200 | consumed samples: 2394112 | consumed tokens: 4903141376 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.819472E+00 | loss scale: 2048.0 | grad norm: 4.969 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.044 | TFLOPs: 42.38 | +[default7]: iteration 2339/ 6200 | consumed samples: 2395136 | consumed tokens: 4905238528 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.805784E+00 | loss scale: 2048.0 | grad norm: 6.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.994 | TFLOPs: 42.37 | +[default7]: iteration 2340/ 6200 | consumed samples: 2396160 | consumed tokens: 4907335680 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.843142E+00 | loss scale: 2048.0 | grad norm: 5.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.308 | TFLOPs: 42.16 | +[default7]: iteration 2341/ 6200 | consumed samples: 2397184 | consumed tokens: 4909432832 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.796748E+00 | loss scale: 2048.0 | grad norm: 6.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.066 | TFLOPs: 42.39 | +[default7]: iteration 2342/ 6200 | consumed samples: 2398208 | consumed tokens: 4911529984 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.792766E+00 | loss scale: 2048.0 | grad norm: 5.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.877 | TFLOPs: 42.33 | +[default7]: iteration 2343/ 6200 | consumed samples: 2399232 | consumed tokens: 4913627136 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.834680E+00 | loss scale: 2048.0 | grad norm: 6.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.079 | TFLOPs: 42.39 | +[default7]: iteration 2344/ 6200 | consumed samples: 2400256 | consumed tokens: 4915724288 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.802594E+00 | loss scale: 2048.0 | grad norm: 5.699 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.335 | TFLOPs: 42.17 | +[default7]: iteration 2345/ 6200 | consumed samples: 2401280 | consumed tokens: 4917821440 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.806334E+00 | loss scale: 2048.0 | grad norm: 5.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.332 | TFLOPs: 42.16 | +[default7]: iteration 2346/ 6200 | consumed samples: 2402304 | consumed tokens: 4919918592 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808894E+00 | loss scale: 2048.0 | grad norm: 5.038 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.148 | TFLOPs: 42.11 | +[default7]: iteration 2347/ 6200 | consumed samples: 2403328 | consumed tokens: 4922015744 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.817369E+00 | loss scale: 2048.0 | grad norm: 5.668 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.943 | TFLOPs: 42.05 | +[default7]: iteration 2348/ 6200 | consumed samples: 2404352 | consumed tokens: 4924112896 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.810385E+00 | loss scale: 2048.0 | grad norm: 5.879 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.091 | TFLOPs: 42.09 | +[default7]: iteration 2349/ 6200 | consumed samples: 2405376 | consumed tokens: 4926210048 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.821730E+00 | loss scale: 2048.0 | grad norm: 5.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.950 | TFLOPs: 42.05 | +[default7]: iteration 2350/ 6200 | consumed samples: 2406400 | consumed tokens: 4928307200 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.813317E+00 | loss scale: 2048.0 | grad norm: 5.865 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.659 | TFLOPs: 42.26 | +[default7]: iteration 2351/ 6200 | consumed samples: 2407424 | consumed tokens: 4930404352 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.815836E+00 | loss scale: 2048.0 | grad norm: 5.918 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.967 | TFLOPs: 42.36 | +[default7]: iteration 2352/ 6200 | consumed samples: 2408448 | consumed tokens: 4932501504 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.831060E+00 | loss scale: 2048.0 | grad norm: 6.945 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.693 | TFLOPs: 42.27 | +[default7]: iteration 2353/ 6200 | consumed samples: 2409472 | consumed tokens: 4934598656 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.864399E+00 | loss scale: 2048.0 | grad norm: 9.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.473 | TFLOPs: 42.21 | +[default7]: iteration 2354/ 6200 | consumed samples: 2410496 | consumed tokens: 4936695808 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808117E+00 | loss scale: 2048.0 | grad norm: 6.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.107 | TFLOPs: 42.10 | +[default7]: iteration 2355/ 6200 | consumed samples: 2411520 | consumed tokens: 4938792960 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.819113E+00 | loss scale: 2048.0 | grad norm: 5.744 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.858 | TFLOPs: 42.32 | +[default7]: iteration 2356/ 6200 | consumed samples: 2412544 | consumed tokens: 4940890112 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799275E+00 | loss scale: 2048.0 | grad norm: 6.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.524 | TFLOPs: 42.22 | +[default7]: iteration 2357/ 6200 | consumed samples: 2413568 | consumed tokens: 4942987264 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.826219E+00 | loss scale: 2048.0 | grad norm: 6.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.395 | TFLOPs: 42.18 | +[default7]: iteration 2358/ 6200 | consumed samples: 2414592 | consumed tokens: 4945084416 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.803320E+00 | loss scale: 2048.0 | grad norm: 6.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.842 | TFLOPs: 42.32 | +[default7]: iteration 2359/ 6200 | consumed samples: 2415616 | consumed tokens: 4947181568 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808846E+00 | loss scale: 2048.0 | grad norm: 5.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.756 | TFLOPs: 42.29 | +[default7]: iteration 2360/ 6200 | consumed samples: 2416640 | consumed tokens: 4949278720 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.824960E+00 | loss scale: 2048.0 | grad norm: 4.954 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.652 | TFLOPs: 41.96 | +[default7]: iteration 2361/ 6200 | consumed samples: 2417664 | consumed tokens: 4951375872 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.822583E+00 | loss scale: 2048.0 | grad norm: 5.738 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.887 | TFLOPs: 42.03 | +[default7]: iteration 2362/ 6200 | consumed samples: 2418688 | consumed tokens: 4953473024 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.803023E+00 | loss scale: 2048.0 | grad norm: 5.027 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.085 | TFLOPs: 42.09 | +[default7]: iteration 2363/ 6200 | consumed samples: 2419712 | consumed tokens: 4955570176 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.796498E+00 | loss scale: 2048.0 | grad norm: 4.835 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.711 | TFLOPs: 42.28 | +[default7]: iteration 2364/ 6200 | consumed samples: 2420736 | consumed tokens: 4957667328 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839353E+00 | loss scale: 2048.0 | grad norm: 5.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.905 | TFLOPs: 42.34 | +[default7]: iteration 2365/ 6200 | consumed samples: 2421760 | consumed tokens: 4959764480 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.818033E+00 | loss scale: 2048.0 | grad norm: 5.699 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.813 | TFLOPs: 42.31 | +[default7]: iteration 2366/ 6200 | consumed samples: 2422784 | consumed tokens: 4961861632 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.813810E+00 | loss scale: 2048.0 | grad norm: 5.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.474 | TFLOPs: 42.21 | +[default7]: iteration 2367/ 6200 | consumed samples: 2423808 | consumed tokens: 4963958784 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.832814E+00 | loss scale: 2048.0 | grad norm: 5.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.105 | TFLOPs: 42.10 | +[default7]: iteration 2368/ 6200 | consumed samples: 2424832 | consumed tokens: 4966055936 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.842135E+00 | loss scale: 2048.0 | grad norm: 6.689 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.096 | TFLOPs: 42.09 | +[default7]: iteration 2369/ 6200 | consumed samples: 2425856 | consumed tokens: 4968153088 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.810415E+00 | loss scale: 2048.0 | grad norm: 7.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.602 | TFLOPs: 42.25 | +[default7]: iteration 2370/ 6200 | consumed samples: 2426880 | consumed tokens: 4970250240 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.836989E+00 | loss scale: 2048.0 | grad norm: 5.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.733 | TFLOPs: 42.29 | +[default7]: iteration 2371/ 6200 | consumed samples: 2427904 | consumed tokens: 4972347392 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795054E+00 | loss scale: 2048.0 | grad norm: 5.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.593 | TFLOPs: 42.24 | +[default7]: iteration 2372/ 6200 | consumed samples: 2428928 | consumed tokens: 4974444544 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.797704E+00 | loss scale: 2048.0 | grad norm: 6.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.006 | TFLOPs: 42.07 | +[default7]: iteration 2373/ 6200 | consumed samples: 2429952 | consumed tokens: 4976541696 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839099E+00 | loss scale: 2048.0 | grad norm: 7.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.475 | TFLOPs: 42.21 | +[default7]: iteration 2374/ 6200 | consumed samples: 2430976 | consumed tokens: 4978638848 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.814465E+00 | loss scale: 2048.0 | grad norm: 6.702 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.103 | TFLOPs: 42.10 | +[default7]: iteration 2375/ 6200 | consumed samples: 2432000 | consumed tokens: 4980736000 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795672E+00 | loss scale: 2048.0 | grad norm: 6.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.845 | TFLOPs: 42.32 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 2375 | lm loss value: 3.514993E+00 | lm loss PPL: 3.361571E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 2375 | lm loss value: 1.713474E+00 | lm loss PPL: 5.548203E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 2376/ 6200 | consumed samples: 2433024 | consumed tokens: 4982833152 | elapsed time per iteration (s): 51.68 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799233E+00 | loss scale: 2048.0 | grad norm: 7.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.814 | TFLOPs: 6.04 | +[default7]: iteration 2377/ 6200 | consumed samples: 2434048 | consumed tokens: 4984930304 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.830039E+00 | loss scale: 2048.0 | grad norm: 5.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.569 | TFLOPs: 42.24 | +[default7]: iteration 2378/ 6200 | consumed samples: 2435072 | consumed tokens: 4987027456 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808768E+00 | loss scale: 2048.0 | grad norm: 6.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.511 | TFLOPs: 42.22 | +[default7]: iteration 2379/ 6200 | consumed samples: 2436096 | consumed tokens: 4989124608 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.827111E+00 | loss scale: 2048.0 | grad norm: 6.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.638 | TFLOPs: 42.26 | +[default7]: iteration 2380/ 6200 | consumed samples: 2437120 | consumed tokens: 4991221760 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.817563E+00 | loss scale: 2048.0 | grad norm: 5.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.673 | TFLOPs: 42.27 | +[default7]: iteration 2381/ 6200 | consumed samples: 2438144 | consumed tokens: 4993318912 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.800574E+00 | loss scale: 2048.0 | grad norm: 5.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.726 | TFLOPs: 42.28 | +[default7]: iteration 2382/ 6200 | consumed samples: 2439168 | consumed tokens: 4995416064 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.823609E+00 | loss scale: 2048.0 | grad norm: 5.975 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.585 | TFLOPs: 42.24 | +[default7]: iteration 2383/ 6200 | consumed samples: 2440192 | consumed tokens: 4997513216 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.817550E+00 | loss scale: 2048.0 | grad norm: 6.664 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.627 | TFLOPs: 42.25 | +[default7]: iteration 2384/ 6200 | consumed samples: 2441216 | consumed tokens: 4999610368 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.821123E+00 | loss scale: 2048.0 | grad norm: 5.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.617 | TFLOPs: 42.25 | +[default7]: iteration 2385/ 6200 | consumed samples: 2442240 | consumed tokens: 5001707520 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.809337E+00 | loss scale: 2048.0 | grad norm: 6.950 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.524 | TFLOPs: 42.22 | +[default7]: iteration 2386/ 6200 | consumed samples: 2443264 | consumed tokens: 5003804672 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.807860E+00 | loss scale: 2048.0 | grad norm: 6.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.578 | TFLOPs: 42.24 | +[default7]: iteration 2387/ 6200 | consumed samples: 2444288 | consumed tokens: 5005901824 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.803748E+00 | loss scale: 2048.0 | grad norm: 6.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.398 | TFLOPs: 42.18 | +[default7]: iteration 2388/ 6200 | consumed samples: 2445312 | consumed tokens: 5007998976 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.785695E+00 | loss scale: 2048.0 | grad norm: 6.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.807 | TFLOPs: 42.31 | +[default7]: iteration 2389/ 6200 | consumed samples: 2446336 | consumed tokens: 5010096128 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.809059E+00 | loss scale: 2048.0 | grad norm: 6.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.487 | TFLOPs: 42.21 | +[default7]: iteration 2390/ 6200 | consumed samples: 2447360 | consumed tokens: 5012193280 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.807757E+00 | loss scale: 2048.0 | grad norm: 7.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.286 | TFLOPs: 42.15 | +[default7]: iteration 2391/ 6200 | consumed samples: 2448384 | consumed tokens: 5014290432 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.817891E+00 | loss scale: 2048.0 | grad norm: 5.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.430 | TFLOPs: 42.19 | +[default7]: iteration 2392/ 6200 | consumed samples: 2449408 | consumed tokens: 5016387584 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794595E+00 | loss scale: 2048.0 | grad norm: 6.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.001 | TFLOPs: 42.37 | +[default7]: iteration 2393/ 6200 | consumed samples: 2450432 | consumed tokens: 5018484736 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.810515E+00 | loss scale: 2048.0 | grad norm: 5.645 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.895 | TFLOPs: 42.34 | +[default7]: iteration 2394/ 6200 | consumed samples: 2451456 | consumed tokens: 5020581888 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.837222E+00 | loss scale: 2048.0 | grad norm: 5.894 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.736 | TFLOPs: 42.29 | +[default7]: iteration 2395/ 6200 | consumed samples: 2452480 | consumed tokens: 5022679040 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.789470E+00 | loss scale: 2048.0 | grad norm: 5.020 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.363 | TFLOPs: 42.17 | +[default7]: iteration 2396/ 6200 | consumed samples: 2453504 | consumed tokens: 5024776192 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.838915E+00 | loss scale: 2048.0 | grad norm: 7.019 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.631 | TFLOPs: 42.26 | +[default7]: iteration 2397/ 6200 | consumed samples: 2454528 | consumed tokens: 5026873344 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.845227E+00 | loss scale: 2048.0 | grad norm: 6.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.508 | TFLOPs: 42.22 | +[default7]: iteration 2398/ 6200 | consumed samples: 2455552 | consumed tokens: 5028970496 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.804265E+00 | loss scale: 2048.0 | grad norm: 6.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.322 | TFLOPs: 42.16 | +[default7]: iteration 2399/ 6200 | consumed samples: 2456576 | consumed tokens: 5031067648 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808133E+00 | loss scale: 2048.0 | grad norm: 6.808 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.397 | TFLOPs: 42.18 | +[default7]: iteration 2400/ 6200 | consumed samples: 2457600 | consumed tokens: 5033164800 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.803201E+00 | loss scale: 2048.0 | grad norm: 5.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.929 | TFLOPs: 42.04 | +[default7]: iteration 2401/ 6200 | consumed samples: 2458624 | consumed tokens: 5035261952 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.804121E+00 | loss scale: 2048.0 | grad norm: 5.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.502 | TFLOPs: 42.22 | +[default7]: iteration 2402/ 6200 | consumed samples: 2459648 | consumed tokens: 5037359104 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.798521E+00 | loss scale: 2048.0 | grad norm: 5.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.262 | TFLOPs: 42.45 | +[default7]: iteration 2403/ 6200 | consumed samples: 2460672 | consumed tokens: 5039456256 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.805128E+00 | loss scale: 2048.0 | grad norm: 6.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.972 | TFLOPs: 42.36 | +[default7]: iteration 2404/ 6200 | consumed samples: 2461696 | consumed tokens: 5041553408 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799948E+00 | loss scale: 2048.0 | grad norm: 6.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.775 | TFLOPs: 42.30 | +[default7]: iteration 2405/ 6200 | consumed samples: 2462720 | consumed tokens: 5043650560 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.801584E+00 | loss scale: 2048.0 | grad norm: 5.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.136 | TFLOPs: 42.41 | +[default7]: iteration 2406/ 6200 | consumed samples: 2463744 | consumed tokens: 5045747712 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.807791E+00 | loss scale: 2048.0 | grad norm: 6.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.337 | TFLOPs: 42.47 | +[default7]: iteration 2407/ 6200 | consumed samples: 2464768 | consumed tokens: 5047844864 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.829054E+00 | loss scale: 2048.0 | grad norm: 5.937 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.911 | TFLOPs: 42.34 | +[default7]: iteration 2408/ 6200 | consumed samples: 2465792 | consumed tokens: 5049942016 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.827582E+00 | loss scale: 2048.0 | grad norm: 6.071 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.320 | TFLOPs: 42.47 | +[default7]: iteration 2409/ 6200 | consumed samples: 2466816 | consumed tokens: 5052039168 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.804752E+00 | loss scale: 2048.0 | grad norm: 5.634 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.151 | TFLOPs: 42.41 | +[default7]: iteration 2410/ 6200 | consumed samples: 2467840 | consumed tokens: 5054136320 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.842227E+00 | loss scale: 2048.0 | grad norm: 5.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.159 | TFLOPs: 42.42 | +[default7]: iteration 2411/ 6200 | consumed samples: 2468864 | consumed tokens: 5056233472 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.815344E+00 | loss scale: 2048.0 | grad norm: 8.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.561 | TFLOPs: 42.23 | +[default7]: iteration 2412/ 6200 | consumed samples: 2469888 | consumed tokens: 5058330624 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.817373E+00 | loss scale: 2048.0 | grad norm: 5.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.871 | TFLOPs: 42.33 | +[default7]: iteration 2413/ 6200 | consumed samples: 2470912 | consumed tokens: 5060427776 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.824366E+00 | loss scale: 2048.0 | grad norm: 5.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.858 | TFLOPs: 42.33 | +[default7]: iteration 2414/ 6200 | consumed samples: 2471936 | consumed tokens: 5062524928 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.787532E+00 | loss scale: 2048.0 | grad norm: 5.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.099 | TFLOPs: 42.40 | +[default7]: iteration 2415/ 6200 | consumed samples: 2472960 | consumed tokens: 5064622080 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.819058E+00 | loss scale: 2048.0 | grad norm: 5.701 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.981 | TFLOPs: 42.36 | +[default7]: iteration 2416/ 6200 | consumed samples: 2473984 | consumed tokens: 5066719232 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.839566E+00 | loss scale: 2048.0 | grad norm: 6.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.507 | TFLOPs: 42.22 | +[default7]: iteration 2417/ 6200 | consumed samples: 2475008 | consumed tokens: 5068816384 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799796E+00 | loss scale: 2048.0 | grad norm: 5.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.777 | TFLOPs: 42.30 | +[default7]: iteration 2418/ 6200 | consumed samples: 2476032 | consumed tokens: 5070913536 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.804973E+00 | loss scale: 2048.0 | grad norm: 5.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.626 | TFLOPs: 42.25 | +[default7]: iteration 2419/ 6200 | consumed samples: 2477056 | consumed tokens: 5073010688 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.826522E+00 | loss scale: 2048.0 | grad norm: 5.847 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.987 | TFLOPs: 42.06 | +[default7]: iteration 2420/ 6200 | consumed samples: 2478080 | consumed tokens: 5075107840 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.800703E+00 | loss scale: 2048.0 | grad norm: 5.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.318 | TFLOPs: 42.16 | +[default7]: iteration 2421/ 6200 | consumed samples: 2479104 | consumed tokens: 5077204992 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.811388E+00 | loss scale: 2048.0 | grad norm: 5.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.148 | TFLOPs: 42.11 | +[default7]: iteration 2422/ 6200 | consumed samples: 2480128 | consumed tokens: 5079302144 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.809143E+00 | loss scale: 2048.0 | grad norm: 5.607 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.419 | TFLOPs: 42.19 | +[default7]: iteration 2423/ 6200 | consumed samples: 2481152 | consumed tokens: 5081399296 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.827829E+00 | loss scale: 2048.0 | grad norm: 6.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.710 | TFLOPs: 41.98 | +[default7]: iteration 2424/ 6200 | consumed samples: 2482176 | consumed tokens: 5083496448 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.797196E+00 | loss scale: 2048.0 | grad norm: 5.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.039 | TFLOPs: 42.08 | +[default7]: iteration 2425/ 6200 | consumed samples: 2483200 | consumed tokens: 5085593600 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.800115E+00 | loss scale: 2048.0 | grad norm: 6.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.933 | TFLOPs: 42.04 | +[default7]: iteration 2426/ 6200 | consumed samples: 2484224 | consumed tokens: 5087690752 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.817962E+00 | loss scale: 2048.0 | grad norm: 5.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.515 | TFLOPs: 42.22 | +[default7]: iteration 2427/ 6200 | consumed samples: 2485248 | consumed tokens: 5089787904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.816960E+00 | loss scale: 2048.0 | grad norm: 5.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.697 | TFLOPs: 42.28 | +[default7]: iteration 2428/ 6200 | consumed samples: 2486272 | consumed tokens: 5091885056 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.807748E+00 | loss scale: 2048.0 | grad norm: 5.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.633 | TFLOPs: 42.26 | +[default7]: iteration 2429/ 6200 | consumed samples: 2487296 | consumed tokens: 5093982208 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.800416E+00 | loss scale: 2048.0 | grad norm: 5.763 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.692 | TFLOPs: 42.27 | +[default7]: iteration 2430/ 6200 | consumed samples: 2488320 | consumed tokens: 5096079360 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.810951E+00 | loss scale: 2048.0 | grad norm: 5.851 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.683 | TFLOPs: 42.27 | +[default7]: iteration 2431/ 6200 | consumed samples: 2489344 | consumed tokens: 5098176512 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.807098E+00 | loss scale: 2048.0 | grad norm: 5.721 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.386 | TFLOPs: 42.18 | +[default7]: iteration 2432/ 6200 | consumed samples: 2490368 | consumed tokens: 5100273664 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.818269E+00 | loss scale: 2048.0 | grad norm: 5.744 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.942 | TFLOPs: 42.35 | +[default7]: iteration 2433/ 6200 | consumed samples: 2491392 | consumed tokens: 5102370816 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.788421E+00 | loss scale: 2048.0 | grad norm: 5.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.924 | TFLOPs: 42.35 | +[default7]: iteration 2434/ 6200 | consumed samples: 2492416 | consumed tokens: 5104467968 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.802397E+00 | loss scale: 2048.0 | grad norm: 6.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.303 | TFLOPs: 42.46 | +[default7]: iteration 2435/ 6200 | consumed samples: 2493440 | consumed tokens: 5106565120 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.823261E+00 | loss scale: 2048.0 | grad norm: 7.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.279 | TFLOPs: 42.45 | +[default7]: iteration 2436/ 6200 | consumed samples: 2494464 | consumed tokens: 5108662272 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.824203E+00 | loss scale: 2048.0 | grad norm: 6.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.123 | TFLOPs: 42.41 | +[default7]: iteration 2437/ 6200 | consumed samples: 2495488 | consumed tokens: 5110759424 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.824849E+00 | loss scale: 2048.0 | grad norm: 6.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.885 | TFLOPs: 42.33 | +[default7]: iteration 2438/ 6200 | consumed samples: 2496512 | consumed tokens: 5112856576 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.819113E+00 | loss scale: 2048.0 | grad norm: 6.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.828 | TFLOPs: 42.32 | +[default7]: iteration 2439/ 6200 | consumed samples: 2497536 | consumed tokens: 5114953728 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.801005E+00 | loss scale: 2048.0 | grad norm: 5.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.970 | TFLOPs: 42.36 | +[default7]: iteration 2440/ 6200 | consumed samples: 2498560 | consumed tokens: 5117050880 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793764E+00 | loss scale: 2048.0 | grad norm: 6.637 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.968 | TFLOPs: 42.36 | +[default7]: iteration 2441/ 6200 | consumed samples: 2499584 | consumed tokens: 5119148032 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.820692E+00 | loss scale: 2048.0 | grad norm: 6.042 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.002 | TFLOPs: 42.37 | +[default7]: iteration 2442/ 6200 | consumed samples: 2500608 | consumed tokens: 5121245184 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.801712E+00 | loss scale: 2048.0 | grad norm: 5.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.170 | TFLOPs: 42.42 | +[default7]: iteration 2443/ 6200 | consumed samples: 2501632 | consumed tokens: 5123342336 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.816714E+00 | loss scale: 2048.0 | grad norm: 6.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.919 | TFLOPs: 42.34 | +[default7]: iteration 2444/ 6200 | consumed samples: 2502656 | consumed tokens: 5125439488 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.801635E+00 | loss scale: 2048.0 | grad norm: 5.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.995 | TFLOPs: 42.37 | +[default7]: iteration 2445/ 6200 | consumed samples: 2503680 | consumed tokens: 5127536640 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778312E+00 | loss scale: 2048.0 | grad norm: 5.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.887 | TFLOPs: 42.33 | +[default7]: iteration 2446/ 6200 | consumed samples: 2504704 | consumed tokens: 5129633792 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.814790E+00 | loss scale: 2048.0 | grad norm: 6.884 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.026 | TFLOPs: 42.38 | +[default7]: iteration 2447/ 6200 | consumed samples: 2505728 | consumed tokens: 5131730944 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.797808E+00 | loss scale: 2048.0 | grad norm: 7.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.814 | TFLOPs: 42.31 | +[default7]: iteration 2448/ 6200 | consumed samples: 2506752 | consumed tokens: 5133828096 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770874E+00 | loss scale: 2048.0 | grad norm: 8.023 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.562 | TFLOPs: 42.23 | +[default7]: iteration 2449/ 6200 | consumed samples: 2507776 | consumed tokens: 5135925248 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.836344E+00 | loss scale: 2048.0 | grad norm: 6.828 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.679 | TFLOPs: 42.27 | +[default7]: iteration 2450/ 6200 | consumed samples: 2508800 | consumed tokens: 5138022400 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.805343E+00 | loss scale: 2048.0 | grad norm: 5.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.525 | TFLOPs: 42.22 | +[default7]: iteration 2451/ 6200 | consumed samples: 2509824 | consumed tokens: 5140119552 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.796237E+00 | loss scale: 2048.0 | grad norm: 6.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.713 | TFLOPs: 42.28 | +[default7]: iteration 2452/ 6200 | consumed samples: 2510848 | consumed tokens: 5142216704 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795430E+00 | loss scale: 2048.0 | grad norm: 5.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.838 | TFLOPs: 42.32 | +[default7]: iteration 2453/ 6200 | consumed samples: 2511872 | consumed tokens: 5144313856 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.834228E+00 | loss scale: 2048.0 | grad norm: 6.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.876 | TFLOPs: 42.33 | +[default7]: iteration 2454/ 6200 | consumed samples: 2512896 | consumed tokens: 5146411008 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808864E+00 | loss scale: 2048.0 | grad norm: 6.687 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.821 | TFLOPs: 42.31 | +[default7]: iteration 2455/ 6200 | consumed samples: 2513920 | consumed tokens: 5148508160 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.809932E+00 | loss scale: 2048.0 | grad norm: 6.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.940 | TFLOPs: 42.35 | +[default7]: iteration 2456/ 6200 | consumed samples: 2514944 | consumed tokens: 5150605312 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.791056E+00 | loss scale: 2048.0 | grad norm: 8.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.847 | TFLOPs: 42.32 | +[default7]: iteration 2457/ 6200 | consumed samples: 2515968 | consumed tokens: 5152702464 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.819566E+00 | loss scale: 2048.0 | grad norm: 7.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.792 | TFLOPs: 42.30 | +[default7]: iteration 2458/ 6200 | consumed samples: 2516992 | consumed tokens: 5154799616 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.796848E+00 | loss scale: 2048.0 | grad norm: 6.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.976 | TFLOPs: 42.36 | +[default7]: iteration 2459/ 6200 | consumed samples: 2518016 | consumed tokens: 5156896768 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.837210E+00 | loss scale: 2048.0 | grad norm: 7.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.207 | TFLOPs: 42.43 | +[default7]: iteration 2460/ 6200 | consumed samples: 2519040 | consumed tokens: 5158993920 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.810546E+00 | loss scale: 2048.0 | grad norm: 6.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.215 | TFLOPs: 42.43 | +[default7]: iteration 2461/ 6200 | consumed samples: 2520064 | consumed tokens: 5161091072 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.815934E+00 | loss scale: 2048.0 | grad norm: 5.019 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.971 | TFLOPs: 42.36 | +[default7]: iteration 2462/ 6200 | consumed samples: 2521088 | consumed tokens: 5163188224 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.811225E+00 | loss scale: 2048.0 | grad norm: 5.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.174 | TFLOPs: 42.42 | +[default7]: iteration 2463/ 6200 | consumed samples: 2522112 | consumed tokens: 5165285376 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.789619E+00 | loss scale: 2048.0 | grad norm: 4.936 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.848 | TFLOPs: 42.32 | +[default7]: iteration 2464/ 6200 | consumed samples: 2523136 | consumed tokens: 5167382528 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.814422E+00 | loss scale: 2048.0 | grad norm: 5.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.072 | TFLOPs: 42.39 | +[default7]: iteration 2465/ 6200 | consumed samples: 2524160 | consumed tokens: 5169479680 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.836953E+00 | loss scale: 2048.0 | grad norm: 5.958 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.124 | TFLOPs: 42.41 | +[default7]: iteration 2466/ 6200 | consumed samples: 2525184 | consumed tokens: 5171576832 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.835121E+00 | loss scale: 2048.0 | grad norm: 6.005 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.961 | TFLOPs: 42.36 | +[default7]: iteration 2467/ 6200 | consumed samples: 2526208 | consumed tokens: 5173673984 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.804621E+00 | loss scale: 2048.0 | grad norm: 5.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.802 | TFLOPs: 42.31 | +[default7]: iteration 2468/ 6200 | consumed samples: 2527232 | consumed tokens: 5175771136 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808059E+00 | loss scale: 2048.0 | grad norm: 5.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.000 | TFLOPs: 42.37 | +[default7]: iteration 2469/ 6200 | consumed samples: 2528256 | consumed tokens: 5177868288 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.824188E+00 | loss scale: 2048.0 | grad norm: 5.724 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.925 | TFLOPs: 42.35 | +[default7]: iteration 2470/ 6200 | consumed samples: 2529280 | consumed tokens: 5179965440 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.820963E+00 | loss scale: 2048.0 | grad norm: 6.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.940 | TFLOPs: 42.35 | +[default7]: iteration 2471/ 6200 | consumed samples: 2530304 | consumed tokens: 5182062592 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.820878E+00 | loss scale: 2048.0 | grad norm: 6.780 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.979 | TFLOPs: 42.36 | +[default7]: iteration 2472/ 6200 | consumed samples: 2531328 | consumed tokens: 5184159744 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.807079E+00 | loss scale: 2048.0 | grad norm: 5.876 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.952 | TFLOPs: 42.35 | +[default7]: iteration 2473/ 6200 | consumed samples: 2532352 | consumed tokens: 5186256896 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.791759E+00 | loss scale: 2048.0 | grad norm: 6.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.978 | TFLOPs: 42.36 | +[default7]: iteration 2474/ 6200 | consumed samples: 2533376 | consumed tokens: 5188354048 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.830343E+00 | loss scale: 2048.0 | grad norm: 5.977 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.816 | TFLOPs: 42.31 | +[default7]: iteration 2475/ 6200 | consumed samples: 2534400 | consumed tokens: 5190451200 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.816627E+00 | loss scale: 2048.0 | grad norm: 6.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.839 | TFLOPs: 42.32 | +[default7]: iteration 2476/ 6200 | consumed samples: 2535424 | consumed tokens: 5192548352 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.812581E+00 | loss scale: 2048.0 | grad norm: 5.663 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.683 | TFLOPs: 42.27 | +[default7]: iteration 2477/ 6200 | consumed samples: 2536448 | consumed tokens: 5194645504 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.815916E+00 | loss scale: 2048.0 | grad norm: 6.914 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.810 | TFLOPs: 42.31 | +[default7]: iteration 2478/ 6200 | consumed samples: 2537472 | consumed tokens: 5196742656 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.821939E+00 | loss scale: 2048.0 | grad norm: 7.072 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.891 | TFLOPs: 42.34 | +[default7]: iteration 2479/ 6200 | consumed samples: 2538496 | consumed tokens: 5198839808 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793913E+00 | loss scale: 2048.0 | grad norm: 6.047 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.727 | TFLOPs: 42.29 | +[default7]: iteration 2480/ 6200 | consumed samples: 2539520 | consumed tokens: 5200936960 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808645E+00 | loss scale: 2048.0 | grad norm: 5.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.905 | TFLOPs: 42.34 | +[default7]: iteration 2481/ 6200 | consumed samples: 2540544 | consumed tokens: 5203034112 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.791651E+00 | loss scale: 2048.0 | grad norm: 6.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.862 | TFLOPs: 42.33 | +[default7]: iteration 2482/ 6200 | consumed samples: 2541568 | consumed tokens: 5205131264 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.829832E+00 | loss scale: 2048.0 | grad norm: 5.845 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.879 | TFLOPs: 42.33 | +[default7]: iteration 2483/ 6200 | consumed samples: 2542592 | consumed tokens: 5207228416 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.791566E+00 | loss scale: 2048.0 | grad norm: 5.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.860 | TFLOPs: 42.33 | +[default7]: iteration 2484/ 6200 | consumed samples: 2543616 | consumed tokens: 5209325568 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793820E+00 | loss scale: 2048.0 | grad norm: 6.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.903 | TFLOPs: 42.34 | +[default7]: iteration 2485/ 6200 | consumed samples: 2544640 | consumed tokens: 5211422720 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.826290E+00 | loss scale: 2048.0 | grad norm: 5.967 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.956 | TFLOPs: 42.35 | +[default7]: iteration 2486/ 6200 | consumed samples: 2545664 | consumed tokens: 5213519872 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.796487E+00 | loss scale: 2048.0 | grad norm: 6.574 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.854 | TFLOPs: 42.32 | +[default7]: iteration 2487/ 6200 | consumed samples: 2546688 | consumed tokens: 5215617024 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770884E+00 | loss scale: 2048.0 | grad norm: 5.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.796 | TFLOPs: 42.31 | +[default7]: iteration 2488/ 6200 | consumed samples: 2547712 | consumed tokens: 5217714176 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.821686E+00 | loss scale: 2048.0 | grad norm: 5.884 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.897 | TFLOPs: 42.34 | +[default7]: iteration 2489/ 6200 | consumed samples: 2548736 | consumed tokens: 5219811328 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.787790E+00 | loss scale: 2048.0 | grad norm: 5.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.863 | TFLOPs: 42.33 | +[default7]: iteration 2490/ 6200 | consumed samples: 2549760 | consumed tokens: 5221908480 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.796231E+00 | loss scale: 2048.0 | grad norm: 6.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.865 | TFLOPs: 42.33 | +[default7]: iteration 2491/ 6200 | consumed samples: 2550784 | consumed tokens: 5224005632 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.796816E+00 | loss scale: 2048.0 | grad norm: 5.869 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.996 | TFLOPs: 42.37 | +[default7]: iteration 2492/ 6200 | consumed samples: 2551808 | consumed tokens: 5226102784 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808405E+00 | loss scale: 2048.0 | grad norm: 5.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.968 | TFLOPs: 42.36 | +[default7]: iteration 2493/ 6200 | consumed samples: 2552832 | consumed tokens: 5228199936 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.788980E+00 | loss scale: 2048.0 | grad norm: 5.897 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.969 | TFLOPs: 42.36 | +[default7]: iteration 2494/ 6200 | consumed samples: 2553856 | consumed tokens: 5230297088 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.809036E+00 | loss scale: 2048.0 | grad norm: 5.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.026 | TFLOPs: 42.38 | +[default7]: iteration 2495/ 6200 | consumed samples: 2554880 | consumed tokens: 5232394240 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.802634E+00 | loss scale: 2048.0 | grad norm: 5.661 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.863 | TFLOPs: 42.33 | +[default7]: iteration 2496/ 6200 | consumed samples: 2555904 | consumed tokens: 5234491392 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.801007E+00 | loss scale: 2048.0 | grad norm: 9.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.818 | TFLOPs: 42.31 | +[default7]: iteration 2497/ 6200 | consumed samples: 2556928 | consumed tokens: 5236588544 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794924E+00 | loss scale: 2048.0 | grad norm: 8.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.859 | TFLOPs: 42.33 | +[default7]: iteration 2498/ 6200 | consumed samples: 2557952 | consumed tokens: 5238685696 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.791962E+00 | loss scale: 2048.0 | grad norm: 5.634 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.978 | TFLOPs: 42.36 | +[default7]: iteration 2499/ 6200 | consumed samples: 2558976 | consumed tokens: 5240782848 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.830936E+00 | loss scale: 2048.0 | grad norm: 7.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.842 | TFLOPs: 42.32 | +[default7]: iteration 2500/ 6200 | consumed samples: 2560000 | consumed tokens: 5242880000 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.824124E+00 | loss scale: 2048.0 | grad norm: 12.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.683 | TFLOPs: 42.27 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 2500 | lm loss value: 3.521706E+00 | lm loss PPL: 3.384213E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 2500 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 18:06:26,183] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2500 is begin to save! +[default0]:[2022-10-06 18:06:26,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_01-model_00-model_states.pt... +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 2500 | lm loss value: 1.704377E+00 | lm loss PPL: 5.497957E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:[2022-10-06 18:06:26,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:26,648] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:26,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:26,679] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:26,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:26,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:26,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:26,735] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:26,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:26,764] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:26,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:26,791] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:26,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:26,820] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:26,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:26,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:26,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:26,877] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:26,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:26,905] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:26,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:26,933] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:26,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:26,962] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:26,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:26,989] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:27,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:27,017] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:27,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:27,045] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:27,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:27,073] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:27,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:27,099] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:27,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:27,127] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:27,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:27,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:27,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:27,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:27,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:27,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:27,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:27,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:27,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:27,277] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_25-model_00-model_states.pt... +[default1]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 18:06:27,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:27,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:27,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:27,337] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 18:06:27,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 18:06:27,339] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/mp_rank_00_model_states.pt +[default0]:[2022-10-06 18:06:27,339] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 18:06:27,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 18:06:27,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 18:06:27,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 18:06:27,551] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 18:06:27,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 18:06:27,579] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 18:06:27,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 18:06:27,576] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 18:06:27,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 18:06:27,560] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 18:06:27,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 18:06:27,558] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 18:06:27,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 18:06:27,600] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 18:06:27,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 18:06:27,597] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 18:06:27,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 18:06:27,582] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 18:06:27,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 18:06:27,563] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 18:06:27,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 18:06:27,558] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 18:06:27,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 18:06:27,665] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 18:06:27,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 18:06:27,663] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 18:06:27,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 18:06:27,649] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 18:06:27,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 18:06:27,666] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 18:06:27,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 18:06:27,671] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 18:06:27,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 18:06:27,655] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 18:06:27,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 18:06:27,644] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 18:06:27,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 18:06:27,672] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 18:06:27,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 18:06:27,666] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 18:06:27,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 18:06:27,670] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 18:06:27,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 18:06:27,676] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 18:06:27,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 18:06:27,667] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 18:06:27,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 18:06:27,653] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 18:06:27,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 18:06:27,670] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 18:06:27,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 18:06:27,685] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 18:06:27,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 18:06:27,695] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 18:06:27,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 18:06:27,671] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 18:06:27,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 18:06:27,766] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default0]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default5]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default2]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default6]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default6]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default2]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]:[2022-10-06 18:06:27,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 18:06:27,799] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]:time (ms) | save-checkpoint: 1625.44 +[default1]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default5]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default2]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default4]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default3]:[2022-10-06 18:06:27,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 18:06:27,767] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default2]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default1]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default3]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default4]:[2022-10-06 18:06:27,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 18:06:27,792] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default4]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default0]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default5]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default6]:[2022-10-06 18:06:27,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 18:06:27,807] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2500/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default3]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default5]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default3]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default4]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default0]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default0]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default0]: successfully saved checkpoint at iteration 2500 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default1]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default6]:[2022-10-06 18:06:27,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2500 is ready now! +[default7]: iteration 2501/ 6200 | consumed samples: 2561024 | consumed tokens: 5244977152 | elapsed time per iteration (s): 53.26 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.813959E+00 | loss scale: 2048.0 | grad norm: 9.773 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.225 | TFLOPs: 5.86 | +[default7]: iteration 2502/ 6200 | consumed samples: 2562048 | consumed tokens: 5247074304 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.801433E+00 | loss scale: 2048.0 | grad norm: 7.048 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.919 | TFLOPs: 42.34 | +[default7]: iteration 2503/ 6200 | consumed samples: 2563072 | consumed tokens: 5249171456 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.800150E+00 | loss scale: 2048.0 | grad norm: 5.700 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.006 | TFLOPs: 42.37 | +[default7]: iteration 2504/ 6200 | consumed samples: 2564096 | consumed tokens: 5251268608 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.806448E+00 | loss scale: 2048.0 | grad norm: 8.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.106 | TFLOPs: 42.40 | +[default7]: iteration 2505/ 6200 | consumed samples: 2565120 | consumed tokens: 5253365760 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.784163E+00 | loss scale: 2048.0 | grad norm: 6.072 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.879 | TFLOPs: 42.33 | +[default7]: iteration 2506/ 6200 | consumed samples: 2566144 | consumed tokens: 5255462912 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.835867E+00 | loss scale: 2048.0 | grad norm: 6.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.828 | TFLOPs: 42.32 | +[default7]: iteration 2507/ 6200 | consumed samples: 2567168 | consumed tokens: 5257560064 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.815432E+00 | loss scale: 2048.0 | grad norm: 5.651 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.190 | TFLOPs: 42.43 | +[default7]: iteration 2508/ 6200 | consumed samples: 2568192 | consumed tokens: 5259657216 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.820940E+00 | loss scale: 2048.0 | grad norm: 4.945 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.991 | TFLOPs: 42.37 | +[default7]: iteration 2509/ 6200 | consumed samples: 2569216 | consumed tokens: 5261754368 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.798158E+00 | loss scale: 2048.0 | grad norm: 6.068 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.996 | TFLOPs: 42.37 | +[default7]: iteration 2510/ 6200 | consumed samples: 2570240 | consumed tokens: 5263851520 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.773580E+00 | loss scale: 2048.0 | grad norm: 5.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.448 | TFLOPs: 42.20 | +[default7]: iteration 2511/ 6200 | consumed samples: 2571264 | consumed tokens: 5265948672 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795249E+00 | loss scale: 2048.0 | grad norm: 5.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.778 | TFLOPs: 42.30 | +[default7]: iteration 2512/ 6200 | consumed samples: 2572288 | consumed tokens: 5268045824 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.807333E+00 | loss scale: 2048.0 | grad norm: 5.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.562 | TFLOPs: 42.23 | +[default7]: iteration 2513/ 6200 | consumed samples: 2573312 | consumed tokens: 5270142976 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.814491E+00 | loss scale: 2048.0 | grad norm: 6.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.797 | TFLOPs: 42.31 | +[default7]: iteration 2514/ 6200 | consumed samples: 2574336 | consumed tokens: 5272240128 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.789529E+00 | loss scale: 2048.0 | grad norm: 5.710 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.611 | TFLOPs: 42.25 | +[default7]: iteration 2515/ 6200 | consumed samples: 2575360 | consumed tokens: 5274337280 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782570E+00 | loss scale: 2048.0 | grad norm: 7.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.654 | TFLOPs: 42.26 | +[default7]: iteration 2516/ 6200 | consumed samples: 2576384 | consumed tokens: 5276434432 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770369E+00 | loss scale: 2048.0 | grad norm: 6.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.560 | TFLOPs: 42.23 | +[default7]: iteration 2517/ 6200 | consumed samples: 2577408 | consumed tokens: 5278531584 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.805807E+00 | loss scale: 2048.0 | grad norm: 5.874 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.590 | TFLOPs: 42.24 | +[default7]: iteration 2518/ 6200 | consumed samples: 2578432 | consumed tokens: 5280628736 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.790734E+00 | loss scale: 2048.0 | grad norm: 6.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.592 | TFLOPs: 42.24 | +[default7]: iteration 2519/ 6200 | consumed samples: 2579456 | consumed tokens: 5282725888 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.804348E+00 | loss scale: 2048.0 | grad norm: 6.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.651 | TFLOPs: 42.26 | +[default7]: iteration 2520/ 6200 | consumed samples: 2580480 | consumed tokens: 5284823040 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770945E+00 | loss scale: 2048.0 | grad norm: 5.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.643 | TFLOPs: 42.26 | +[default7]: iteration 2521/ 6200 | consumed samples: 2581504 | consumed tokens: 5286920192 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795613E+00 | loss scale: 2048.0 | grad norm: 5.752 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.223 | TFLOPs: 42.13 | +[default7]: iteration 2522/ 6200 | consumed samples: 2582528 | consumed tokens: 5289017344 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.777728E+00 | loss scale: 2048.0 | grad norm: 5.686 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.600 | TFLOPs: 42.25 | +[default7]: iteration 2523/ 6200 | consumed samples: 2583552 | consumed tokens: 5291114496 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.801655E+00 | loss scale: 2048.0 | grad norm: 5.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.707 | TFLOPs: 42.28 | +[default7]: iteration 2524/ 6200 | consumed samples: 2584576 | consumed tokens: 5293211648 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.806190E+00 | loss scale: 2048.0 | grad norm: 5.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.666 | TFLOPs: 42.27 | +[default7]: iteration 2525/ 6200 | consumed samples: 2585600 | consumed tokens: 5295308800 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.806758E+00 | loss scale: 2048.0 | grad norm: 5.996 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.633 | TFLOPs: 42.26 | +[default7]: iteration 2526/ 6200 | consumed samples: 2586624 | consumed tokens: 5297405952 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799682E+00 | loss scale: 2048.0 | grad norm: 6.040 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.767 | TFLOPs: 42.30 | +[default7]: iteration 2527/ 6200 | consumed samples: 2587648 | consumed tokens: 5299503104 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.810119E+00 | loss scale: 2048.0 | grad norm: 5.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.650 | TFLOPs: 42.26 | +[default7]: iteration 2528/ 6200 | consumed samples: 2588672 | consumed tokens: 5301600256 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.801398E+00 | loss scale: 2048.0 | grad norm: 6.693 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.705 | TFLOPs: 41.97 | +[default7]: iteration 2529/ 6200 | consumed samples: 2589696 | consumed tokens: 5303697408 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.823516E+00 | loss scale: 2048.0 | grad norm: 6.858 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.808 | TFLOPs: 42.31 | +[default7]: iteration 2530/ 6200 | consumed samples: 2590720 | consumed tokens: 5305794560 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.822693E+00 | loss scale: 2048.0 | grad norm: 6.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.702 | TFLOPs: 42.28 | +[default7]: iteration 2531/ 6200 | consumed samples: 2591744 | consumed tokens: 5307891712 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.800454E+00 | loss scale: 2048.0 | grad norm: 4.974 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.512 | TFLOPs: 42.22 | +[default7]: iteration 2532/ 6200 | consumed samples: 2592768 | consumed tokens: 5309988864 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.829672E+00 | loss scale: 2048.0 | grad norm: 5.867 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.815 | TFLOPs: 42.31 | +[default7]: iteration 2533/ 6200 | consumed samples: 2593792 | consumed tokens: 5312086016 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.819247E+00 | loss scale: 2048.0 | grad norm: 6.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.829 | TFLOPs: 42.32 | +[default7]: iteration 2534/ 6200 | consumed samples: 2594816 | consumed tokens: 5314183168 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.804681E+00 | loss scale: 2048.0 | grad norm: 6.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.741 | TFLOPs: 42.29 | +[default7]: iteration 2535/ 6200 | consumed samples: 2595840 | consumed tokens: 5316280320 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.809775E+00 | loss scale: 2048.0 | grad norm: 7.027 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.671 | TFLOPs: 42.27 | +[default7]: iteration 2536/ 6200 | consumed samples: 2596864 | consumed tokens: 5318377472 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794222E+00 | loss scale: 2048.0 | grad norm: 6.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.854 | TFLOPs: 42.32 | +[default7]: iteration 2537/ 6200 | consumed samples: 2597888 | consumed tokens: 5320474624 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.811808E+00 | loss scale: 2048.0 | grad norm: 6.866 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.615 | TFLOPs: 42.25 | +[default7]: iteration 2538/ 6200 | consumed samples: 2598912 | consumed tokens: 5322571776 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793394E+00 | loss scale: 2048.0 | grad norm: 6.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.509 | TFLOPs: 42.22 | +[default7]: iteration 2539/ 6200 | consumed samples: 2599936 | consumed tokens: 5324668928 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.784347E+00 | loss scale: 2048.0 | grad norm: 5.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.527 | TFLOPs: 42.22 | +[default7]: iteration 2540/ 6200 | consumed samples: 2600960 | consumed tokens: 5326766080 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.788820E+00 | loss scale: 2048.0 | grad norm: 4.867 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.568 | TFLOPs: 42.24 | +[default7]: iteration 2541/ 6200 | consumed samples: 2601984 | consumed tokens: 5328863232 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.781564E+00 | loss scale: 2048.0 | grad norm: 6.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.600 | TFLOPs: 42.25 | +[default7]: iteration 2542/ 6200 | consumed samples: 2603008 | consumed tokens: 5330960384 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.816319E+00 | loss scale: 2048.0 | grad norm: 5.976 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.697 | TFLOPs: 42.28 | +[default7]: iteration 2543/ 6200 | consumed samples: 2604032 | consumed tokens: 5333057536 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794153E+00 | loss scale: 2048.0 | grad norm: 7.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.550 | TFLOPs: 42.23 | +[default7]: iteration 2544/ 6200 | consumed samples: 2605056 | consumed tokens: 5335154688 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799257E+00 | loss scale: 2048.0 | grad norm: 4.849 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.511 | TFLOPs: 42.22 | +[default7]: iteration 2545/ 6200 | consumed samples: 2606080 | consumed tokens: 5337251840 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.811862E+00 | loss scale: 2048.0 | grad norm: 7.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.656 | TFLOPs: 42.26 | +[default7]: iteration 2546/ 6200 | consumed samples: 2607104 | consumed tokens: 5339348992 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.797761E+00 | loss scale: 2048.0 | grad norm: 6.573 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.612 | TFLOPs: 42.25 | +[default7]: iteration 2547/ 6200 | consumed samples: 2608128 | consumed tokens: 5341446144 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.801932E+00 | loss scale: 2048.0 | grad norm: 5.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.455 | TFLOPs: 42.20 | +[default7]: iteration 2548/ 6200 | consumed samples: 2609152 | consumed tokens: 5343543296 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.809779E+00 | loss scale: 2048.0 | grad norm: 5.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.775 | TFLOPs: 42.30 | +[default7]: iteration 2549/ 6200 | consumed samples: 2610176 | consumed tokens: 5345640448 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.824107E+00 | loss scale: 2048.0 | grad norm: 6.696 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.754 | TFLOPs: 42.29 | +[default7]: iteration 2550/ 6200 | consumed samples: 2611200 | consumed tokens: 5347737600 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.783470E+00 | loss scale: 2048.0 | grad norm: 5.968 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.777 | TFLOPs: 42.30 | +[default7]: iteration 2551/ 6200 | consumed samples: 2612224 | consumed tokens: 5349834752 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.817630E+00 | loss scale: 2048.0 | grad norm: 5.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.744 | TFLOPs: 42.29 | +[default7]: iteration 2552/ 6200 | consumed samples: 2613248 | consumed tokens: 5351931904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.802678E+00 | loss scale: 2048.0 | grad norm: 5.641 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.668 | TFLOPs: 42.27 | +[default7]: iteration 2553/ 6200 | consumed samples: 2614272 | consumed tokens: 5354029056 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.792470E+00 | loss scale: 2048.0 | grad norm: 5.629 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.670 | TFLOPs: 42.27 | +[default7]: iteration 2554/ 6200 | consumed samples: 2615296 | consumed tokens: 5356126208 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793584E+00 | loss scale: 2048.0 | grad norm: 5.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.604 | TFLOPs: 42.25 | +[default7]: iteration 2555/ 6200 | consumed samples: 2616320 | consumed tokens: 5358223360 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782962E+00 | loss scale: 2048.0 | grad norm: 4.987 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.733 | TFLOPs: 42.29 | +[default7]: iteration 2556/ 6200 | consumed samples: 2617344 | consumed tokens: 5360320512 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.811181E+00 | loss scale: 2048.0 | grad norm: 6.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.640 | TFLOPs: 42.26 | +[default7]: iteration 2557/ 6200 | consumed samples: 2618368 | consumed tokens: 5362417664 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.807172E+00 | loss scale: 2048.0 | grad norm: 6.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.748 | TFLOPs: 41.99 | +[default7]: iteration 2558/ 6200 | consumed samples: 2619392 | consumed tokens: 5364514816 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.812526E+00 | loss scale: 2048.0 | grad norm: 6.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.840 | TFLOPs: 42.32 | +[default7]: iteration 2559/ 6200 | consumed samples: 2620416 | consumed tokens: 5366611968 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.792834E+00 | loss scale: 2048.0 | grad norm: 7.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.717 | TFLOPs: 42.28 | +[default7]: iteration 2560/ 6200 | consumed samples: 2621440 | consumed tokens: 5368709120 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.797341E+00 | loss scale: 2048.0 | grad norm: 5.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.831 | TFLOPs: 42.32 | +[default7]: iteration 2561/ 6200 | consumed samples: 2622464 | consumed tokens: 5370806272 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.821935E+00 | loss scale: 2048.0 | grad norm: 7.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.817 | TFLOPs: 42.01 | +[default7]: iteration 2562/ 6200 | consumed samples: 2623488 | consumed tokens: 5372903424 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.811151E+00 | loss scale: 2048.0 | grad norm: 7.076 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.787 | TFLOPs: 42.30 | +[default7]: iteration 2563/ 6200 | consumed samples: 2624512 | consumed tokens: 5375000576 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.785591E+00 | loss scale: 2048.0 | grad norm: 5.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.776 | TFLOPs: 42.30 | +[default7]: iteration 2564/ 6200 | consumed samples: 2625536 | consumed tokens: 5377097728 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.805050E+00 | loss scale: 2048.0 | grad norm: 6.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.975 | TFLOPs: 42.36 | +[default7]: iteration 2565/ 6200 | consumed samples: 2626560 | consumed tokens: 5379194880 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.798307E+00 | loss scale: 2048.0 | grad norm: 5.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.813 | TFLOPs: 42.31 | +[default7]: iteration 2566/ 6200 | consumed samples: 2627584 | consumed tokens: 5381292032 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.789370E+00 | loss scale: 2048.0 | grad norm: 5.792 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.799 | TFLOPs: 42.00 | +[default7]: iteration 2567/ 6200 | consumed samples: 2628608 | consumed tokens: 5383389184 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799249E+00 | loss scale: 2048.0 | grad norm: 5.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.948 | TFLOPs: 42.35 | +[default7]: iteration 2568/ 6200 | consumed samples: 2629632 | consumed tokens: 5385486336 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.814899E+00 | loss scale: 2048.0 | grad norm: 6.062 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.887 | TFLOPs: 42.33 | +[default7]: iteration 2569/ 6200 | consumed samples: 2630656 | consumed tokens: 5387583488 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.767096E+00 | loss scale: 2048.0 | grad norm: 6.008 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.927 | TFLOPs: 42.35 | +[default7]: iteration 2570/ 6200 | consumed samples: 2631680 | consumed tokens: 5389680640 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.800631E+00 | loss scale: 2048.0 | grad norm: 6.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.804 | TFLOPs: 42.31 | +[default7]: iteration 2571/ 6200 | consumed samples: 2632704 | consumed tokens: 5391777792 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794102E+00 | loss scale: 2048.0 | grad norm: 5.692 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.940 | TFLOPs: 42.35 | +[default7]: iteration 2572/ 6200 | consumed samples: 2633728 | consumed tokens: 5393874944 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.803092E+00 | loss scale: 2048.0 | grad norm: 5.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.828 | TFLOPs: 42.32 | +[default7]: iteration 2573/ 6200 | consumed samples: 2634752 | consumed tokens: 5395972096 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.809864E+00 | loss scale: 2048.0 | grad norm: 5.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.888 | TFLOPs: 42.33 | +[default7]: iteration 2574/ 6200 | consumed samples: 2635776 | consumed tokens: 5398069248 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.823216E+00 | loss scale: 2048.0 | grad norm: 6.851 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.784 | TFLOPs: 42.30 | +[default7]: iteration 2575/ 6200 | consumed samples: 2636800 | consumed tokens: 5400166400 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.796126E+00 | loss scale: 2048.0 | grad norm: 7.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.832 | TFLOPs: 42.32 | +[default7]: iteration 2576/ 6200 | consumed samples: 2637824 | consumed tokens: 5402263552 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.791700E+00 | loss scale: 2048.0 | grad norm: 5.771 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.823 | TFLOPs: 42.31 | +[default7]: iteration 2577/ 6200 | consumed samples: 2638848 | consumed tokens: 5404360704 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.797931E+00 | loss scale: 2048.0 | grad norm: 6.643 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.863 | TFLOPs: 42.33 | +[default7]: iteration 2578/ 6200 | consumed samples: 2639872 | consumed tokens: 5406457856 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.791548E+00 | loss scale: 2048.0 | grad norm: 7.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.894 | TFLOPs: 42.34 | +[default7]: iteration 2579/ 6200 | consumed samples: 2640896 | consumed tokens: 5408555008 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.810714E+00 | loss scale: 2048.0 | grad norm: 6.982 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.879 | TFLOPs: 42.33 | +[default7]: iteration 2580/ 6200 | consumed samples: 2641920 | consumed tokens: 5410652160 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.827956E+00 | loss scale: 2048.0 | grad norm: 6.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.681 | TFLOPs: 42.27 | +[default7]: iteration 2581/ 6200 | consumed samples: 2642944 | consumed tokens: 5412749312 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794274E+00 | loss scale: 2048.0 | grad norm: 5.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.916 | TFLOPs: 42.34 | +[default7]: iteration 2582/ 6200 | consumed samples: 2643968 | consumed tokens: 5414846464 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.804722E+00 | loss scale: 2048.0 | grad norm: 6.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.880 | TFLOPs: 42.33 | +[default7]: iteration 2583/ 6200 | consumed samples: 2644992 | consumed tokens: 5416943616 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.785062E+00 | loss scale: 2048.0 | grad norm: 6.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.797 | TFLOPs: 42.31 | +[default7]: iteration 2584/ 6200 | consumed samples: 2646016 | consumed tokens: 5419040768 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.821672E+00 | loss scale: 2048.0 | grad norm: 6.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.554 | TFLOPs: 42.23 | +[default7]: iteration 2585/ 6200 | consumed samples: 2647040 | consumed tokens: 5421137920 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794883E+00 | loss scale: 2048.0 | grad norm: 5.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.820 | TFLOPs: 42.31 | +[default7]: iteration 2586/ 6200 | consumed samples: 2648064 | consumed tokens: 5423235072 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.825233E+00 | loss scale: 2048.0 | grad norm: 6.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.648 | TFLOPs: 42.26 | +[default7]: iteration 2587/ 6200 | consumed samples: 2649088 | consumed tokens: 5425332224 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.786033E+00 | loss scale: 2048.0 | grad norm: 5.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.948 | TFLOPs: 42.35 | +[default7]: iteration 2588/ 6200 | consumed samples: 2650112 | consumed tokens: 5427429376 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794900E+00 | loss scale: 2048.0 | grad norm: 6.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.101 | TFLOPs: 42.40 | +[default7]: iteration 2589/ 6200 | consumed samples: 2651136 | consumed tokens: 5429526528 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808115E+00 | loss scale: 2048.0 | grad norm: 6.076 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.917 | TFLOPs: 42.34 | +[default7]: iteration 2590/ 6200 | consumed samples: 2652160 | consumed tokens: 5431623680 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794743E+00 | loss scale: 2048.0 | grad norm: 6.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.173 | TFLOPs: 42.42 | +[default7]: iteration 2591/ 6200 | consumed samples: 2653184 | consumed tokens: 5433720832 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793910E+00 | loss scale: 2048.0 | grad norm: 7.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.670 | TFLOPs: 42.27 | +[default7]: iteration 2592/ 6200 | consumed samples: 2654208 | consumed tokens: 5435817984 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.816260E+00 | loss scale: 2048.0 | grad norm: 9.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.127 | TFLOPs: 42.41 | +[default7]: iteration 2593/ 6200 | consumed samples: 2655232 | consumed tokens: 5437915136 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770158E+00 | loss scale: 2048.0 | grad norm: 7.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.208 | TFLOPs: 42.43 | +[default7]: iteration 2594/ 6200 | consumed samples: 2656256 | consumed tokens: 5440012288 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795280E+00 | loss scale: 2048.0 | grad norm: 5.688 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.088 | TFLOPs: 42.40 | +[default7]: iteration 2595/ 6200 | consumed samples: 2657280 | consumed tokens: 5442109440 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.784191E+00 | loss scale: 2048.0 | grad norm: 6.776 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.131 | TFLOPs: 42.10 | +[default7]: iteration 2596/ 6200 | consumed samples: 2658304 | consumed tokens: 5444206592 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.773723E+00 | loss scale: 2048.0 | grad norm: 7.778 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.093 | TFLOPs: 42.40 | +[default7]: iteration 2597/ 6200 | consumed samples: 2659328 | consumed tokens: 5446303744 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.791591E+00 | loss scale: 2048.0 | grad norm: 5.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.857 | TFLOPs: 42.32 | +[default7]: iteration 2598/ 6200 | consumed samples: 2660352 | consumed tokens: 5448400896 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.816641E+00 | loss scale: 2048.0 | grad norm: 5.002 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.770 | TFLOPs: 42.30 | +[default7]: iteration 2599/ 6200 | consumed samples: 2661376 | consumed tokens: 5450498048 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.809158E+00 | loss scale: 2048.0 | grad norm: 6.846 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.709 | TFLOPs: 41.97 | +[default7]: iteration 2600/ 6200 | consumed samples: 2662400 | consumed tokens: 5452595200 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.779724E+00 | loss scale: 2048.0 | grad norm: 6.005 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.731 | TFLOPs: 42.29 | +[default7]: iteration 2601/ 6200 | consumed samples: 2663424 | consumed tokens: 5454692352 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782916E+00 | loss scale: 2048.0 | grad norm: 5.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.579 | TFLOPs: 42.24 | +[default7]: iteration 2602/ 6200 | consumed samples: 2664448 | consumed tokens: 5456789504 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.783840E+00 | loss scale: 2048.0 | grad norm: 6.725 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.052 | TFLOPs: 42.38 | +[default7]: iteration 2603/ 6200 | consumed samples: 2665472 | consumed tokens: 5458886656 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.781614E+00 | loss scale: 2048.0 | grad norm: 5.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.183 | TFLOPs: 42.42 | +[default7]: iteration 2604/ 6200 | consumed samples: 2666496 | consumed tokens: 5460983808 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770583E+00 | loss scale: 2048.0 | grad norm: 6.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.932 | TFLOPs: 42.35 | +[default7]: iteration 2605/ 6200 | consumed samples: 2667520 | consumed tokens: 5463080960 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756579E+00 | loss scale: 2048.0 | grad norm: 5.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.193 | TFLOPs: 42.43 | +[default7]: iteration 2606/ 6200 | consumed samples: 2668544 | consumed tokens: 5465178112 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.788398E+00 | loss scale: 2048.0 | grad norm: 6.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.212 | TFLOPs: 42.43 | +[default7]: iteration 2607/ 6200 | consumed samples: 2669568 | consumed tokens: 5467275264 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.786329E+00 | loss scale: 2048.0 | grad norm: 5.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.250 | TFLOPs: 42.44 | +[default7]: iteration 2608/ 6200 | consumed samples: 2670592 | consumed tokens: 5469372416 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.791002E+00 | loss scale: 2048.0 | grad norm: 6.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.066 | TFLOPs: 42.39 | +[default7]: iteration 2609/ 6200 | consumed samples: 2671616 | consumed tokens: 5471469568 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.786877E+00 | loss scale: 2048.0 | grad norm: 5.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.112 | TFLOPs: 42.40 | +[default7]: iteration 2610/ 6200 | consumed samples: 2672640 | consumed tokens: 5473566720 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808603E+00 | loss scale: 2048.0 | grad norm: 6.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.020 | TFLOPs: 42.37 | +[default7]: iteration 2611/ 6200 | consumed samples: 2673664 | consumed tokens: 5475663872 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808706E+00 | loss scale: 2048.0 | grad norm: 5.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.200 | TFLOPs: 42.43 | +[default7]: iteration 2612/ 6200 | consumed samples: 2674688 | consumed tokens: 5477761024 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.816851E+00 | loss scale: 2048.0 | grad norm: 5.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.110 | TFLOPs: 42.40 | +[default7]: iteration 2613/ 6200 | consumed samples: 2675712 | consumed tokens: 5479858176 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793468E+00 | loss scale: 2048.0 | grad norm: 5.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.116 | TFLOPs: 42.40 | +[default7]: iteration 2614/ 6200 | consumed samples: 2676736 | consumed tokens: 5481955328 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.831722E+00 | loss scale: 2048.0 | grad norm: 5.774 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.091 | TFLOPs: 42.40 | +[default7]: iteration 2615/ 6200 | consumed samples: 2677760 | consumed tokens: 5484052480 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.769495E+00 | loss scale: 2048.0 | grad norm: 5.011 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.033 | TFLOPs: 42.38 | +[default7]: iteration 2616/ 6200 | consumed samples: 2678784 | consumed tokens: 5486149632 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794508E+00 | loss scale: 2048.0 | grad norm: 5.768 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.987 | TFLOPs: 42.36 | +[default7]: iteration 2617/ 6200 | consumed samples: 2679808 | consumed tokens: 5488246784 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.780991E+00 | loss scale: 2048.0 | grad norm: 5.603 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.123 | TFLOPs: 42.41 | +[default7]: iteration 2618/ 6200 | consumed samples: 2680832 | consumed tokens: 5490343936 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.805267E+00 | loss scale: 2048.0 | grad norm: 6.035 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.040 | TFLOPs: 42.38 | +[default7]: iteration 2619/ 6200 | consumed samples: 2681856 | consumed tokens: 5492441088 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794052E+00 | loss scale: 2048.0 | grad norm: 7.879 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.181 | TFLOPs: 42.42 | +[default7]: iteration 2620/ 6200 | consumed samples: 2682880 | consumed tokens: 5494538240 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.781791E+00 | loss scale: 2048.0 | grad norm: 5.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.136 | TFLOPs: 42.41 | +[default7]: iteration 2621/ 6200 | consumed samples: 2683904 | consumed tokens: 5496635392 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808306E+00 | loss scale: 2048.0 | grad norm: 5.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.995 | TFLOPs: 42.37 | +[default7]: iteration 2622/ 6200 | consumed samples: 2684928 | consumed tokens: 5498732544 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794134E+00 | loss scale: 2048.0 | grad norm: 6.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.469 | TFLOPs: 42.21 | +[default7]: iteration 2623/ 6200 | consumed samples: 2685952 | consumed tokens: 5500829696 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.822255E+00 | loss scale: 2048.0 | grad norm: 5.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.235 | TFLOPs: 42.44 | +[default7]: iteration 2624/ 6200 | consumed samples: 2686976 | consumed tokens: 5502926848 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.797589E+00 | loss scale: 2048.0 | grad norm: 8.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.874 | TFLOPs: 42.33 | +[default7]: iteration 2625/ 6200 | consumed samples: 2688000 | consumed tokens: 5505024000 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770151E+00 | loss scale: 2048.0 | grad norm: 5.773 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.119 | TFLOPs: 42.40 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 2625 | lm loss value: 3.519416E+00 | lm loss PPL: 3.376471E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 2625 | lm loss value: 1.689623E+00 | lm loss PPL: 5.417435E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 2626/ 6200 | consumed samples: 2689024 | consumed tokens: 5507121152 | elapsed time per iteration (s): 51.78 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.777130E+00 | loss scale: 2048.0 | grad norm: 5.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.774 | TFLOPs: 6.03 | +[default7]: iteration 2627/ 6200 | consumed samples: 2690048 | consumed tokens: 5509218304 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799117E+00 | loss scale: 2048.0 | grad norm: 6.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.801 | TFLOPs: 42.31 | +[default7]: iteration 2628/ 6200 | consumed samples: 2691072 | consumed tokens: 5511315456 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.810719E+00 | loss scale: 2048.0 | grad norm: 5.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.676 | TFLOPs: 42.27 | +[default7]: iteration 2629/ 6200 | consumed samples: 2692096 | consumed tokens: 5513412608 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.785395E+00 | loss scale: 2048.0 | grad norm: 5.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.625 | TFLOPs: 42.25 | +[default7]: iteration 2630/ 6200 | consumed samples: 2693120 | consumed tokens: 5515509760 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.789546E+00 | loss scale: 2048.0 | grad norm: 7.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.819 | TFLOPs: 42.31 | +[default7]: iteration 2631/ 6200 | consumed samples: 2694144 | consumed tokens: 5517606912 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799178E+00 | loss scale: 2048.0 | grad norm: 7.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.484 | TFLOPs: 42.21 | +[default7]: iteration 2632/ 6200 | consumed samples: 2695168 | consumed tokens: 5519704064 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.789309E+00 | loss scale: 2048.0 | grad norm: 5.653 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.855 | TFLOPs: 42.32 | +[default7]: iteration 2633/ 6200 | consumed samples: 2696192 | consumed tokens: 5521801216 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.813009E+00 | loss scale: 2048.0 | grad norm: 6.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.717 | TFLOPs: 42.28 | +[default7]: iteration 2634/ 6200 | consumed samples: 2697216 | consumed tokens: 5523898368 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.796712E+00 | loss scale: 2048.0 | grad norm: 5.972 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.634 | TFLOPs: 42.26 | +[default7]: iteration 2635/ 6200 | consumed samples: 2698240 | consumed tokens: 5525995520 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.802435E+00 | loss scale: 2048.0 | grad norm: 6.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.569 | TFLOPs: 42.24 | +[default7]: iteration 2636/ 6200 | consumed samples: 2699264 | consumed tokens: 5528092672 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.789975E+00 | loss scale: 2048.0 | grad norm: 6.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.648 | TFLOPs: 42.26 | +[default7]: iteration 2637/ 6200 | consumed samples: 2700288 | consumed tokens: 5530189824 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.783105E+00 | loss scale: 2048.0 | grad norm: 6.790 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.595 | TFLOPs: 42.24 | +[default7]: iteration 2638/ 6200 | consumed samples: 2701312 | consumed tokens: 5532286976 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.779623E+00 | loss scale: 2048.0 | grad norm: 5.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.545 | TFLOPs: 42.23 | +[default7]: iteration 2639/ 6200 | consumed samples: 2702336 | consumed tokens: 5534384128 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.807628E+00 | loss scale: 2048.0 | grad norm: 5.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.739 | TFLOPs: 42.29 | +[default7]: iteration 2640/ 6200 | consumed samples: 2703360 | consumed tokens: 5536481280 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772350E+00 | loss scale: 2048.0 | grad norm: 5.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.419 | TFLOPs: 42.19 | +[default7]: iteration 2641/ 6200 | consumed samples: 2704384 | consumed tokens: 5538578432 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799087E+00 | loss scale: 2048.0 | grad norm: 5.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.666 | TFLOPs: 42.27 | +[default7]: iteration 2642/ 6200 | consumed samples: 2705408 | consumed tokens: 5540675584 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.813692E+00 | loss scale: 2048.0 | grad norm: 5.768 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.712 | TFLOPs: 42.28 | +[default7]: iteration 2643/ 6200 | consumed samples: 2706432 | consumed tokens: 5542772736 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.816331E+00 | loss scale: 2048.0 | grad norm: 5.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.517 | TFLOPs: 42.22 | +[default7]: iteration 2644/ 6200 | consumed samples: 2707456 | consumed tokens: 5544869888 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808652E+00 | loss scale: 2048.0 | grad norm: 6.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.986 | TFLOPs: 42.36 | +[default7]: iteration 2645/ 6200 | consumed samples: 2708480 | consumed tokens: 5546967040 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799580E+00 | loss scale: 2048.0 | grad norm: 5.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.379 | TFLOPs: 42.18 | +[default7]: iteration 2646/ 6200 | consumed samples: 2709504 | consumed tokens: 5549064192 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775060E+00 | loss scale: 2048.0 | grad norm: 5.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.768 | TFLOPs: 42.30 | +[default7]: iteration 2647/ 6200 | consumed samples: 2710528 | consumed tokens: 5551161344 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.784864E+00 | loss scale: 2048.0 | grad norm: 7.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.837 | TFLOPs: 42.32 | +[default7]: iteration 2648/ 6200 | consumed samples: 2711552 | consumed tokens: 5553258496 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.791681E+00 | loss scale: 2048.0 | grad norm: 7.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.674 | TFLOPs: 42.27 | +[default7]: iteration 2649/ 6200 | consumed samples: 2712576 | consumed tokens: 5555355648 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793548E+00 | loss scale: 2048.0 | grad norm: 6.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.570 | TFLOPs: 42.24 | +[default7]: iteration 2650/ 6200 | consumed samples: 2713600 | consumed tokens: 5557452800 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782533E+00 | loss scale: 2048.0 | grad norm: 6.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.695 | TFLOPs: 42.28 | +[default7]: iteration 2651/ 6200 | consumed samples: 2714624 | consumed tokens: 5559549952 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.784605E+00 | loss scale: 2048.0 | grad norm: 5.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.811 | TFLOPs: 42.31 | +[default7]: iteration 2652/ 6200 | consumed samples: 2715648 | consumed tokens: 5561647104 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.785277E+00 | loss scale: 2048.0 | grad norm: 5.935 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.653 | TFLOPs: 42.26 | +[default7]: iteration 2653/ 6200 | consumed samples: 2716672 | consumed tokens: 5563744256 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.791994E+00 | loss scale: 2048.0 | grad norm: 5.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.338 | TFLOPs: 42.17 | +[default7]: iteration 2654/ 6200 | consumed samples: 2717696 | consumed tokens: 5565841408 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.786068E+00 | loss scale: 2048.0 | grad norm: 5.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.161 | TFLOPs: 42.11 | +[default7]: iteration 2655/ 6200 | consumed samples: 2718720 | consumed tokens: 5567938560 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795402E+00 | loss scale: 2048.0 | grad norm: 4.697 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.797 | TFLOPs: 42.31 | +[default7]: iteration 2656/ 6200 | consumed samples: 2719744 | consumed tokens: 5570035712 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.777624E+00 | loss scale: 2048.0 | grad norm: 5.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.767 | TFLOPs: 42.30 | +[default7]: iteration 2657/ 6200 | consumed samples: 2720768 | consumed tokens: 5572132864 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765425E+00 | loss scale: 2048.0 | grad norm: 5.793 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.570 | TFLOPs: 42.24 | +[default7]: iteration 2658/ 6200 | consumed samples: 2721792 | consumed tokens: 5574230016 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.792740E+00 | loss scale: 2048.0 | grad norm: 7.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.728 | TFLOPs: 42.29 | +[default7]: iteration 2659/ 6200 | consumed samples: 2722816 | consumed tokens: 5576327168 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775696E+00 | loss scale: 2048.0 | grad norm: 5.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.323 | TFLOPs: 42.16 | +[default7]: iteration 2660/ 6200 | consumed samples: 2723840 | consumed tokens: 5578424320 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.784929E+00 | loss scale: 2048.0 | grad norm: 6.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.489 | TFLOPs: 42.21 | +[default7]: iteration 2661/ 6200 | consumed samples: 2724864 | consumed tokens: 5580521472 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.779001E+00 | loss scale: 2048.0 | grad norm: 4.676 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.095 | TFLOPs: 42.09 | +[default7]: iteration 2662/ 6200 | consumed samples: 2725888 | consumed tokens: 5582618624 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.801654E+00 | loss scale: 2048.0 | grad norm: 5.671 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.142 | TFLOPs: 42.11 | +[default7]: iteration 2663/ 6200 | consumed samples: 2726912 | consumed tokens: 5584715776 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.823434E+00 | loss scale: 2048.0 | grad norm: 6.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.371 | TFLOPs: 42.18 | +[default7]: iteration 2664/ 6200 | consumed samples: 2727936 | consumed tokens: 5586812928 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793189E+00 | loss scale: 2048.0 | grad norm: 6.569 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.431 | TFLOPs: 42.19 | +[default7]: iteration 2665/ 6200 | consumed samples: 2728960 | consumed tokens: 5588910080 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.755091E+00 | loss scale: 2048.0 | grad norm: 5.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.439 | TFLOPs: 42.20 | +[default7]: iteration 2666/ 6200 | consumed samples: 2729984 | consumed tokens: 5591007232 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.786088E+00 | loss scale: 2048.0 | grad norm: 5.068 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.229 | TFLOPs: 42.13 | +[default7]: iteration 2667/ 6200 | consumed samples: 2731008 | consumed tokens: 5593104384 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.791934E+00 | loss scale: 2048.0 | grad norm: 6.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.292 | TFLOPs: 42.15 | +[default7]: iteration 2668/ 6200 | consumed samples: 2732032 | consumed tokens: 5595201536 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794162E+00 | loss scale: 2048.0 | grad norm: 5.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.322 | TFLOPs: 42.16 | +[default7]: iteration 2669/ 6200 | consumed samples: 2733056 | consumed tokens: 5597298688 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.798551E+00 | loss scale: 2048.0 | grad norm: 5.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.178 | TFLOPs: 42.12 | +[default7]: iteration 2670/ 6200 | consumed samples: 2734080 | consumed tokens: 5599395840 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.806046E+00 | loss scale: 2048.0 | grad norm: 6.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.528 | TFLOPs: 42.22 | +[default7]: iteration 2671/ 6200 | consumed samples: 2735104 | consumed tokens: 5601492992 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.801033E+00 | loss scale: 2048.0 | grad norm: 6.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.282 | TFLOPs: 42.15 | +[default7]: iteration 2672/ 6200 | consumed samples: 2736128 | consumed tokens: 5603590144 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775296E+00 | loss scale: 2048.0 | grad norm: 5.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.553 | TFLOPs: 42.23 | +[default7]: iteration 2673/ 6200 | consumed samples: 2737152 | consumed tokens: 5605687296 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.798193E+00 | loss scale: 2048.0 | grad norm: 5.062 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.410 | TFLOPs: 42.19 | +[default7]: iteration 2674/ 6200 | consumed samples: 2738176 | consumed tokens: 5607784448 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.807680E+00 | loss scale: 2048.0 | grad norm: 5.585 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.366 | TFLOPs: 42.18 | +[default7]: iteration 2675/ 6200 | consumed samples: 2739200 | consumed tokens: 5609881600 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.806493E+00 | loss scale: 2048.0 | grad norm: 6.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.453 | TFLOPs: 42.20 | +[default7]: iteration 2676/ 6200 | consumed samples: 2740224 | consumed tokens: 5611978752 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.798765E+00 | loss scale: 2048.0 | grad norm: 5.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.663 | TFLOPs: 42.27 | +[default7]: iteration 2677/ 6200 | consumed samples: 2741248 | consumed tokens: 5614075904 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794041E+00 | loss scale: 2048.0 | grad norm: 6.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.628 | TFLOPs: 42.26 | +[default7]: iteration 2678/ 6200 | consumed samples: 2742272 | consumed tokens: 5616173056 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765592E+00 | loss scale: 2048.0 | grad norm: 6.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.605 | TFLOPs: 42.25 | +[default7]: iteration 2679/ 6200 | consumed samples: 2743296 | consumed tokens: 5618270208 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.798541E+00 | loss scale: 2048.0 | grad norm: 5.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.263 | TFLOPs: 42.14 | +[default7]: iteration 2680/ 6200 | consumed samples: 2744320 | consumed tokens: 5620367360 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761100E+00 | loss scale: 2048.0 | grad norm: 6.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.745 | TFLOPs: 42.29 | +[default7]: iteration 2681/ 6200 | consumed samples: 2745344 | consumed tokens: 5622464512 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.798355E+00 | loss scale: 2048.0 | grad norm: 6.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.311 | TFLOPs: 42.16 | +[default7]: iteration 2682/ 6200 | consumed samples: 2746368 | consumed tokens: 5624561664 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.792292E+00 | loss scale: 2048.0 | grad norm: 5.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.637 | TFLOPs: 42.26 | +[default7]: iteration 2683/ 6200 | consumed samples: 2747392 | consumed tokens: 5626658816 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.791081E+00 | loss scale: 2048.0 | grad norm: 5.869 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.361 | TFLOPs: 42.17 | +[default7]: iteration 2684/ 6200 | consumed samples: 2748416 | consumed tokens: 5628755968 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.790128E+00 | loss scale: 2048.0 | grad norm: 5.953 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.313 | TFLOPs: 42.16 | +[default7]: iteration 2685/ 6200 | consumed samples: 2749440 | consumed tokens: 5630853120 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.819005E+00 | loss scale: 2048.0 | grad norm: 5.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.403 | TFLOPs: 42.19 | +[default7]: iteration 2686/ 6200 | consumed samples: 2750464 | consumed tokens: 5632950272 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.768800E+00 | loss scale: 2048.0 | grad norm: 5.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.465 | TFLOPs: 42.21 | +[default7]: iteration 2687/ 6200 | consumed samples: 2751488 | consumed tokens: 5635047424 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.779379E+00 | loss scale: 2048.0 | grad norm: 5.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.394 | TFLOPs: 42.18 | +[default7]: iteration 2688/ 6200 | consumed samples: 2752512 | consumed tokens: 5637144576 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.777638E+00 | loss scale: 2048.0 | grad norm: 5.928 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.637 | TFLOPs: 42.26 | +[default7]: iteration 2689/ 6200 | consumed samples: 2753536 | consumed tokens: 5639241728 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.774358E+00 | loss scale: 2048.0 | grad norm: 5.594 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.718 | TFLOPs: 42.28 | +[default7]: iteration 2690/ 6200 | consumed samples: 2754560 | consumed tokens: 5641338880 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.798558E+00 | loss scale: 2048.0 | grad norm: 5.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.576 | TFLOPs: 42.24 | +[default7]: iteration 2691/ 6200 | consumed samples: 2755584 | consumed tokens: 5643436032 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799796E+00 | loss scale: 2048.0 | grad norm: 6.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.213 | TFLOPs: 42.13 | +[default7]: iteration 2692/ 6200 | consumed samples: 2756608 | consumed tokens: 5645533184 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.800836E+00 | loss scale: 2048.0 | grad norm: 6.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.299 | TFLOPs: 42.15 | +[default7]: iteration 2693/ 6200 | consumed samples: 2757632 | consumed tokens: 5647630336 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.797223E+00 | loss scale: 2048.0 | grad norm: 5.657 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.301 | TFLOPs: 42.16 | +[default7]: iteration 2694/ 6200 | consumed samples: 2758656 | consumed tokens: 5649727488 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.776255E+00 | loss scale: 2048.0 | grad norm: 5.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.291 | TFLOPs: 42.15 | +[default7]: iteration 2695/ 6200 | consumed samples: 2759680 | consumed tokens: 5651824640 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.787850E+00 | loss scale: 2048.0 | grad norm: 5.952 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.538 | TFLOPs: 42.23 | +[default7]: iteration 2696/ 6200 | consumed samples: 2760704 | consumed tokens: 5653921792 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778019E+00 | loss scale: 2048.0 | grad norm: 5.894 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.511 | TFLOPs: 42.22 | +[default7]: iteration 2697/ 6200 | consumed samples: 2761728 | consumed tokens: 5656018944 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.786172E+00 | loss scale: 2048.0 | grad norm: 5.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.639 | TFLOPs: 42.26 | +[default7]: iteration 2698/ 6200 | consumed samples: 2762752 | consumed tokens: 5658116096 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.771041E+00 | loss scale: 2048.0 | grad norm: 6.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.561 | TFLOPs: 42.23 | +[default7]: iteration 2699/ 6200 | consumed samples: 2763776 | consumed tokens: 5660213248 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793965E+00 | loss scale: 2048.0 | grad norm: 7.811 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.464 | TFLOPs: 42.21 | +[default7]: iteration 2700/ 6200 | consumed samples: 2764800 | consumed tokens: 5662310400 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.779049E+00 | loss scale: 2048.0 | grad norm: 5.671 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.793 | TFLOPs: 42.31 | +[default7]: iteration 2701/ 6200 | consumed samples: 2765824 | consumed tokens: 5664407552 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.797242E+00 | loss scale: 2048.0 | grad norm: 5.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.570 | TFLOPs: 42.24 | +[default7]: iteration 2702/ 6200 | consumed samples: 2766848 | consumed tokens: 5666504704 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.820805E+00 | loss scale: 2048.0 | grad norm: 7.012 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.483 | TFLOPs: 42.21 | +[default7]: iteration 2703/ 6200 | consumed samples: 2767872 | consumed tokens: 5668601856 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.771908E+00 | loss scale: 2048.0 | grad norm: 5.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.436 | TFLOPs: 42.20 | +[default7]: iteration 2704/ 6200 | consumed samples: 2768896 | consumed tokens: 5670699008 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.786280E+00 | loss scale: 2048.0 | grad norm: 5.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.320 | TFLOPs: 42.16 | +[default7]: iteration 2705/ 6200 | consumed samples: 2769920 | consumed tokens: 5672796160 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.789808E+00 | loss scale: 2048.0 | grad norm: 5.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.059 | TFLOPs: 42.08 | +[default7]: iteration 2706/ 6200 | consumed samples: 2770944 | consumed tokens: 5674893312 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808430E+00 | loss scale: 2048.0 | grad norm: 6.917 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.364 | TFLOPs: 42.17 | +[default7]: iteration 2707/ 6200 | consumed samples: 2771968 | consumed tokens: 5676990464 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.786830E+00 | loss scale: 2048.0 | grad norm: 6.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.064 | TFLOPs: 42.08 | +[default7]: iteration 2708/ 6200 | consumed samples: 2772992 | consumed tokens: 5679087616 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799179E+00 | loss scale: 2048.0 | grad norm: 5.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.763 | TFLOPs: 42.30 | +[default7]: iteration 2709/ 6200 | consumed samples: 2774016 | consumed tokens: 5681184768 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.773702E+00 | loss scale: 2048.0 | grad norm: 6.010 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.245 | TFLOPs: 42.14 | +[default7]: iteration 2710/ 6200 | consumed samples: 2775040 | consumed tokens: 5683281920 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775620E+00 | loss scale: 2048.0 | grad norm: 6.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.494 | TFLOPs: 42.21 | +[default7]: iteration 2711/ 6200 | consumed samples: 2776064 | consumed tokens: 5685379072 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.781044E+00 | loss scale: 2048.0 | grad norm: 5.783 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.958 | TFLOPs: 42.05 | +[default7]: iteration 2712/ 6200 | consumed samples: 2777088 | consumed tokens: 5687476224 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.784389E+00 | loss scale: 2048.0 | grad norm: 5.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.171 | TFLOPs: 42.12 | +[default7]: iteration 2713/ 6200 | consumed samples: 2778112 | consumed tokens: 5689573376 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.796046E+00 | loss scale: 2048.0 | grad norm: 5.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.120 | TFLOPs: 42.10 | +[default7]: iteration 2714/ 6200 | consumed samples: 2779136 | consumed tokens: 5691670528 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.803211E+00 | loss scale: 2048.0 | grad norm: 6.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.964 | TFLOPs: 42.36 | +[default7]: iteration 2715/ 6200 | consumed samples: 2780160 | consumed tokens: 5693767680 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.790826E+00 | loss scale: 2048.0 | grad norm: 5.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.185 | TFLOPs: 42.42 | +[default7]: iteration 2716/ 6200 | consumed samples: 2781184 | consumed tokens: 5695864832 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.783686E+00 | loss scale: 2048.0 | grad norm: 5.859 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.258 | TFLOPs: 42.45 | +[default7]: iteration 2717/ 6200 | consumed samples: 2782208 | consumed tokens: 5697961984 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795909E+00 | loss scale: 2048.0 | grad norm: 5.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.849 | TFLOPs: 42.32 | +[default7]: iteration 2718/ 6200 | consumed samples: 2783232 | consumed tokens: 5700059136 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.785791E+00 | loss scale: 2048.0 | grad norm: 5.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.969 | TFLOPs: 42.36 | +[default7]: iteration 2719/ 6200 | consumed samples: 2784256 | consumed tokens: 5702156288 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.785026E+00 | loss scale: 2048.0 | grad norm: 5.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.274 | TFLOPs: 42.15 | +[default7]: iteration 2720/ 6200 | consumed samples: 2785280 | consumed tokens: 5704253440 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770986E+00 | loss scale: 2048.0 | grad norm: 6.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.101 | TFLOPs: 42.09 | +[default7]: iteration 2721/ 6200 | consumed samples: 2786304 | consumed tokens: 5706350592 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.791577E+00 | loss scale: 2048.0 | grad norm: 6.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.567 | TFLOPs: 42.24 | +[default7]: iteration 2722/ 6200 | consumed samples: 2787328 | consumed tokens: 5708447744 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.790730E+00 | loss scale: 2048.0 | grad norm: 6.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.481 | TFLOPs: 42.21 | +[default7]: iteration 2723/ 6200 | consumed samples: 2788352 | consumed tokens: 5710544896 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.810709E+00 | loss scale: 2048.0 | grad norm: 5.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.094 | TFLOPs: 42.09 | +[default7]: iteration 2724/ 6200 | consumed samples: 2789376 | consumed tokens: 5712642048 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.777410E+00 | loss scale: 2048.0 | grad norm: 7.696 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.651 | TFLOPs: 42.26 | +[default7]: iteration 2725/ 6200 | consumed samples: 2790400 | consumed tokens: 5714739200 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.771128E+00 | loss scale: 2048.0 | grad norm: 5.780 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.345 | TFLOPs: 42.17 | +[default7]: iteration 2726/ 6200 | consumed samples: 2791424 | consumed tokens: 5716836352 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.803179E+00 | loss scale: 2048.0 | grad norm: 5.757 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.010 | TFLOPs: 42.07 | +[default7]: iteration 2727/ 6200 | consumed samples: 2792448 | consumed tokens: 5718933504 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775172E+00 | loss scale: 2048.0 | grad norm: 5.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.201 | TFLOPs: 42.12 | +[default7]: iteration 2728/ 6200 | consumed samples: 2793472 | consumed tokens: 5721030656 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.760861E+00 | loss scale: 2048.0 | grad norm: 5.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.318 | TFLOPs: 42.16 | +[default7]: iteration 2729/ 6200 | consumed samples: 2794496 | consumed tokens: 5723127808 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.786309E+00 | loss scale: 2048.0 | grad norm: 5.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.250 | TFLOPs: 42.14 | +[default7]: iteration 2730/ 6200 | consumed samples: 2795520 | consumed tokens: 5725224960 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.783973E+00 | loss scale: 2048.0 | grad norm: 5.900 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.286 | TFLOPs: 42.15 | +[default7]: iteration 2731/ 6200 | consumed samples: 2796544 | consumed tokens: 5727322112 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772824E+00 | loss scale: 2048.0 | grad norm: 6.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.316 | TFLOPs: 42.16 | +[default7]: iteration 2732/ 6200 | consumed samples: 2797568 | consumed tokens: 5729419264 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.766296E+00 | loss scale: 2048.0 | grad norm: 5.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.257 | TFLOPs: 42.14 | +[default7]: iteration 2733/ 6200 | consumed samples: 2798592 | consumed tokens: 5731516416 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.787419E+00 | loss scale: 2048.0 | grad norm: 5.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.028 | TFLOPs: 42.07 | +[default7]: iteration 2734/ 6200 | consumed samples: 2799616 | consumed tokens: 5733613568 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.764154E+00 | loss scale: 2048.0 | grad norm: 5.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.134 | TFLOPs: 42.10 | +[default7]: iteration 2735/ 6200 | consumed samples: 2800640 | consumed tokens: 5735710720 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.788223E+00 | loss scale: 2048.0 | grad norm: 5.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.580 | TFLOPs: 41.94 | +[default7]: iteration 2736/ 6200 | consumed samples: 2801664 | consumed tokens: 5737807872 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.767416E+00 | loss scale: 2048.0 | grad norm: 5.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.940 | TFLOPs: 42.05 | +[default7]: iteration 2737/ 6200 | consumed samples: 2802688 | consumed tokens: 5739905024 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795057E+00 | loss scale: 2048.0 | grad norm: 5.967 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.358 | TFLOPs: 42.17 | +[default7]: iteration 2738/ 6200 | consumed samples: 2803712 | consumed tokens: 5742002176 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772070E+00 | loss scale: 2048.0 | grad norm: 5.813 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.026 | TFLOPs: 42.38 | +[default7]: iteration 2739/ 6200 | consumed samples: 2804736 | consumed tokens: 5744099328 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.768475E+00 | loss scale: 2048.0 | grad norm: 5.014 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.536 | TFLOPs: 42.23 | +[default7]: iteration 2740/ 6200 | consumed samples: 2805760 | consumed tokens: 5746196480 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.777103E+00 | loss scale: 2048.0 | grad norm: 5.768 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.665 | TFLOPs: 42.27 | +[default7]: iteration 2741/ 6200 | consumed samples: 2806784 | consumed tokens: 5748293632 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.774966E+00 | loss scale: 2048.0 | grad norm: 6.988 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.101 | TFLOPs: 42.09 | +[default7]: iteration 2742/ 6200 | consumed samples: 2807808 | consumed tokens: 5750390784 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.825954E+00 | loss scale: 2048.0 | grad norm: 6.863 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.028 | TFLOPs: 42.07 | +[default7]: iteration 2743/ 6200 | consumed samples: 2808832 | consumed tokens: 5752487936 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772781E+00 | loss scale: 2048.0 | grad norm: 6.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.964 | TFLOPs: 42.05 | +[default7]: iteration 2744/ 6200 | consumed samples: 2809856 | consumed tokens: 5754585088 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.788387E+00 | loss scale: 2048.0 | grad norm: 6.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.185 | TFLOPs: 42.12 | +[default7]: iteration 2745/ 6200 | consumed samples: 2810880 | consumed tokens: 5756682240 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.796604E+00 | loss scale: 2048.0 | grad norm: 5.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.494 | TFLOPs: 42.21 | +[default7]: iteration 2746/ 6200 | consumed samples: 2811904 | consumed tokens: 5758779392 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.773322E+00 | loss scale: 2048.0 | grad norm: 4.826 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.997 | TFLOPs: 42.06 | +[default7]: iteration 2747/ 6200 | consumed samples: 2812928 | consumed tokens: 5760876544 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808010E+00 | loss scale: 2048.0 | grad norm: 5.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.000 | TFLOPs: 42.06 | +[default7]: iteration 2748/ 6200 | consumed samples: 2813952 | consumed tokens: 5762973696 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.810997E+00 | loss scale: 2048.0 | grad norm: 5.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.172 | TFLOPs: 42.12 | +[default7]: iteration 2749/ 6200 | consumed samples: 2814976 | consumed tokens: 5765070848 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.818385E+00 | loss scale: 2048.0 | grad norm: 5.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.146 | TFLOPs: 42.11 | +[default7]: iteration 2750/ 6200 | consumed samples: 2816000 | consumed tokens: 5767168000 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.785710E+00 | loss scale: 2048.0 | grad norm: 5.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.470 | TFLOPs: 42.21 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 2750 | lm loss value: 3.536659E+00 | lm loss PPL: 3.435197E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 2750 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 18:38:43,521] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2750 is begin to save! +[default0]:[2022-10-06 18:38:43,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_01-model_00-model_states.pt... +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 2750 | lm loss value: 1.679020E+00 | lm loss PPL: 5.360302E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:[2022-10-06 18:38:43,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:43,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,000] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,035] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,063] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,118] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,174] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,228] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,255] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,283] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,309] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,336] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,391] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,444] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,499] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,553] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,579] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,633] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 18:38:44,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 18:38:44,635] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/mp_rank_00_model_states.pt +[default0]:[2022-10-06 18:38:44,635] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 18:38:44,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/mp_rank_00_model_states.pt. +[default1]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 18:38:44,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 18:38:44,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 18:38:44,844] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 18:38:44,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 18:38:44,913] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 18:38:44,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 18:38:44,874] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 18:38:44,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 18:38:44,871] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 18:38:44,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 18:38:44,924] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 18:38:44,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 18:38:44,853] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 18:38:44,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 18:38:44,848] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 18:38:44,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 18:38:44,944] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 18:38:44,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 18:38:44,879] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 18:38:44,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 18:38:44,948] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 18:38:44,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 18:38:44,924] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 18:38:44,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 18:38:44,955] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 18:38:44,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 18:38:44,958] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 18:38:44,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 18:38:44,947] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 18:38:44,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 18:38:44,963] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 18:38:44,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 18:38:44,934] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 18:38:44,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 18:38:44,964] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 18:38:44,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 18:38:44,966] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 18:38:44,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 18:38:44,942] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 18:38:44,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 18:38:44,974] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 18:38:44,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 18:38:44,945] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 18:38:44,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 18:38:44,957] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 18:38:44,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 18:38:44,964] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 18:38:44,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 18:38:44,992] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 18:38:44,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 18:38:44,962] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 18:38:44,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 18:38:44,992] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 18:38:44,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 18:38:44,978] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 18:38:45,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 18:38:45,039] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 18:38:45,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 18:38:45,089] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default5]:[2022-10-06 18:38:45,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 18:38:45,097] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default1]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default2]:[2022-10-06 18:38:45,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 18:38:45,108] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default0]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default7]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default7]:time (ms) | save-checkpoint: 1647.88 +[default7]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default1]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default4]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default7]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default6]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default3]:[2022-10-06 18:38:45,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 18:38:45,168] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step2750/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default7]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default6]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default3]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default6]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default4]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default2]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default0]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default0]: successfully saved checkpoint at iteration 2750 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default3]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default4]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default4]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default3]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default2]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default5]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default2]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default1]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default5]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default1]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default0]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default0]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default6]:[2022-10-06 18:38:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2750 is ready now! +[default7]: iteration 2751/ 6200 | consumed samples: 2817024 | consumed tokens: 5769265152 | elapsed time per iteration (s): 53.58 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795701E+00 | loss scale: 2048.0 | grad norm: 5.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.112 | TFLOPs: 5.83 | +[default7]: iteration 2752/ 6200 | consumed samples: 2818048 | consumed tokens: 5771362304 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.771971E+00 | loss scale: 2048.0 | grad norm: 6.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.534 | TFLOPs: 41.92 | +[default7]: iteration 2753/ 6200 | consumed samples: 2819072 | consumed tokens: 5773459456 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.779671E+00 | loss scale: 2048.0 | grad norm: 5.646 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.541 | TFLOPs: 41.92 | +[default7]: iteration 2754/ 6200 | consumed samples: 2820096 | consumed tokens: 5775556608 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.789894E+00 | loss scale: 2048.0 | grad norm: 6.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.626 | TFLOPs: 41.95 | +[default7]: iteration 2755/ 6200 | consumed samples: 2821120 | consumed tokens: 5777653760 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.787761E+00 | loss scale: 2048.0 | grad norm: 6.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.173 | TFLOPs: 42.12 | +[default7]: iteration 2756/ 6200 | consumed samples: 2822144 | consumed tokens: 5779750912 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.783224E+00 | loss scale: 2048.0 | grad norm: 5.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.425 | TFLOPs: 42.19 | +[default7]: iteration 2757/ 6200 | consumed samples: 2823168 | consumed tokens: 5781848064 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.764866E+00 | loss scale: 2048.0 | grad norm: 6.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.580 | TFLOPs: 42.24 | +[default7]: iteration 2758/ 6200 | consumed samples: 2824192 | consumed tokens: 5783945216 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.792116E+00 | loss scale: 2048.0 | grad norm: 4.553 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.647 | TFLOPs: 42.26 | +[default7]: iteration 2759/ 6200 | consumed samples: 2825216 | consumed tokens: 5786042368 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.780096E+00 | loss scale: 2048.0 | grad norm: 5.719 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.641 | TFLOPs: 42.26 | +[default7]: iteration 2760/ 6200 | consumed samples: 2826240 | consumed tokens: 5788139520 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.779712E+00 | loss scale: 2048.0 | grad norm: 6.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.279 | TFLOPs: 42.15 | +[default7]: iteration 2761/ 6200 | consumed samples: 2827264 | consumed tokens: 5790236672 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.766392E+00 | loss scale: 2048.0 | grad norm: 5.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.394 | TFLOPs: 42.18 | +[default7]: iteration 2762/ 6200 | consumed samples: 2828288 | consumed tokens: 5792333824 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799151E+00 | loss scale: 2048.0 | grad norm: 5.937 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.562 | TFLOPs: 42.23 | +[default7]: iteration 2763/ 6200 | consumed samples: 2829312 | consumed tokens: 5794430976 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782249E+00 | loss scale: 2048.0 | grad norm: 5.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.607 | TFLOPs: 42.25 | +[default7]: iteration 2764/ 6200 | consumed samples: 2830336 | consumed tokens: 5796528128 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795010E+00 | loss scale: 2048.0 | grad norm: 6.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.489 | TFLOPs: 42.21 | +[default7]: iteration 2765/ 6200 | consumed samples: 2831360 | consumed tokens: 5798625280 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793154E+00 | loss scale: 2048.0 | grad norm: 5.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.267 | TFLOPs: 42.15 | +[default7]: iteration 2766/ 6200 | consumed samples: 2832384 | consumed tokens: 5800722432 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.776121E+00 | loss scale: 2048.0 | grad norm: 5.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.915 | TFLOPs: 42.34 | +[default7]: iteration 2767/ 6200 | consumed samples: 2833408 | consumed tokens: 5802819584 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.787837E+00 | loss scale: 2048.0 | grad norm: 5.670 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.535 | TFLOPs: 42.23 | +[default7]: iteration 2768/ 6200 | consumed samples: 2834432 | consumed tokens: 5804916736 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772769E+00 | loss scale: 2048.0 | grad norm: 5.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.551 | TFLOPs: 42.23 | +[default7]: iteration 2769/ 6200 | consumed samples: 2835456 | consumed tokens: 5807013888 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.767177E+00 | loss scale: 2048.0 | grad norm: 5.637 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.621 | TFLOPs: 42.25 | +[default7]: iteration 2770/ 6200 | consumed samples: 2836480 | consumed tokens: 5809111040 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.781861E+00 | loss scale: 2048.0 | grad norm: 7.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.567 | TFLOPs: 42.24 | +[default7]: iteration 2771/ 6200 | consumed samples: 2837504 | consumed tokens: 5811208192 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782010E+00 | loss scale: 2048.0 | grad norm: 5.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.899 | TFLOPs: 42.03 | +[default7]: iteration 2772/ 6200 | consumed samples: 2838528 | consumed tokens: 5813305344 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.767959E+00 | loss scale: 2048.0 | grad norm: 6.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.396 | TFLOPs: 42.18 | +[default7]: iteration 2773/ 6200 | consumed samples: 2839552 | consumed tokens: 5815402496 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770347E+00 | loss scale: 2048.0 | grad norm: 5.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.327 | TFLOPs: 42.16 | +[default7]: iteration 2774/ 6200 | consumed samples: 2840576 | consumed tokens: 5817499648 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.776052E+00 | loss scale: 2048.0 | grad norm: 5.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.405 | TFLOPs: 42.19 | +[default7]: iteration 2775/ 6200 | consumed samples: 2841600 | consumed tokens: 5819596800 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.781437E+00 | loss scale: 2048.0 | grad norm: 8.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.315 | TFLOPs: 42.16 | +[default7]: iteration 2776/ 6200 | consumed samples: 2842624 | consumed tokens: 5821693952 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.763062E+00 | loss scale: 2048.0 | grad norm: 6.795 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.435 | TFLOPs: 42.20 | +[default7]: iteration 2777/ 6200 | consumed samples: 2843648 | consumed tokens: 5823791104 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.771424E+00 | loss scale: 2048.0 | grad norm: 5.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.074 | TFLOPs: 42.09 | +[default7]: iteration 2778/ 6200 | consumed samples: 2844672 | consumed tokens: 5825888256 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747703E+00 | loss scale: 2048.0 | grad norm: 7.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.404 | TFLOPs: 42.19 | +[default7]: iteration 2779/ 6200 | consumed samples: 2845696 | consumed tokens: 5827985408 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.785179E+00 | loss scale: 2048.0 | grad norm: 5.582 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.235 | TFLOPs: 42.14 | +[default7]: iteration 2780/ 6200 | consumed samples: 2846720 | consumed tokens: 5830082560 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.769395E+00 | loss scale: 2048.0 | grad norm: 5.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.424 | TFLOPs: 42.19 | +[default7]: iteration 2781/ 6200 | consumed samples: 2847744 | consumed tokens: 5832179712 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782975E+00 | loss scale: 2048.0 | grad norm: 4.718 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.445 | TFLOPs: 42.20 | +[default7]: iteration 2782/ 6200 | consumed samples: 2848768 | consumed tokens: 5834276864 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795413E+00 | loss scale: 2048.0 | grad norm: 6.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.553 | TFLOPs: 42.23 | +[default7]: iteration 2783/ 6200 | consumed samples: 2849792 | consumed tokens: 5836374016 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746851E+00 | loss scale: 2048.0 | grad norm: 5.645 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.406 | TFLOPs: 42.19 | +[default7]: iteration 2784/ 6200 | consumed samples: 2850816 | consumed tokens: 5838471168 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.755254E+00 | loss scale: 2048.0 | grad norm: 5.739 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.320 | TFLOPs: 42.16 | +[default7]: iteration 2785/ 6200 | consumed samples: 2851840 | consumed tokens: 5840568320 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.792108E+00 | loss scale: 2048.0 | grad norm: 5.945 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.480 | TFLOPs: 42.21 | +[default7]: iteration 2786/ 6200 | consumed samples: 2852864 | consumed tokens: 5842665472 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772359E+00 | loss scale: 2048.0 | grad norm: 6.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.472 | TFLOPs: 42.21 | +[default7]: iteration 2787/ 6200 | consumed samples: 2853888 | consumed tokens: 5844762624 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.792039E+00 | loss scale: 2048.0 | grad norm: 6.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.538 | TFLOPs: 42.23 | +[default7]: iteration 2788/ 6200 | consumed samples: 2854912 | consumed tokens: 5846859776 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.779641E+00 | loss scale: 2048.0 | grad norm: 5.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.429 | TFLOPs: 42.19 | +[default7]: iteration 2789/ 6200 | consumed samples: 2855936 | consumed tokens: 5848956928 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778774E+00 | loss scale: 2048.0 | grad norm: 6.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.211 | TFLOPs: 42.13 | +[default7]: iteration 2790/ 6200 | consumed samples: 2856960 | consumed tokens: 5851054080 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.762666E+00 | loss scale: 2048.0 | grad norm: 5.615 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.257 | TFLOPs: 42.14 | +[default7]: iteration 2791/ 6200 | consumed samples: 2857984 | consumed tokens: 5853151232 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.787499E+00 | loss scale: 2048.0 | grad norm: 6.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.323 | TFLOPs: 42.16 | +[default7]: iteration 2792/ 6200 | consumed samples: 2859008 | consumed tokens: 5855248384 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778726E+00 | loss scale: 4096.0 | grad norm: 2.717 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.318 | TFLOPs: 42.16 | +[default7]: iteration 2793/ 6200 | consumed samples: 2860032 | consumed tokens: 5857345536 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770381E+00 | loss scale: 4096.0 | grad norm: 5.061 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.481 | TFLOPs: 42.21 | +[default7]: iteration 2794/ 6200 | consumed samples: 2861056 | consumed tokens: 5859442688 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.798085E+00 | loss scale: 4096.0 | grad norm: 5.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.356 | TFLOPs: 42.17 | +[default7]: iteration 2795/ 6200 | consumed samples: 2862080 | consumed tokens: 5861539840 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.786060E+00 | loss scale: 4096.0 | grad norm: 5.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.743 | TFLOPs: 42.29 | +[default7]: iteration 2796/ 6200 | consumed samples: 2863104 | consumed tokens: 5863636992 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.816654E+00 | loss scale: 4096.0 | grad norm: 5.747 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.951 | TFLOPs: 42.35 | +[default7]: iteration 2797/ 6200 | consumed samples: 2864128 | consumed tokens: 5865734144 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.779110E+00 | loss scale: 4096.0 | grad norm: 5.513 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.896 | TFLOPs: 42.34 | +[default7]: iteration 2798/ 6200 | consumed samples: 2865152 | consumed tokens: 5867831296 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.781285E+00 | loss scale: 4096.0 | grad norm: 5.781 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.769 | TFLOPs: 42.30 | +[default7]: iteration 2799/ 6200 | consumed samples: 2866176 | consumed tokens: 5869928448 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.779018E+00 | loss scale: 4096.0 | grad norm: 7.649 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.854 | TFLOPs: 42.32 | +[default7]: iteration 2800/ 6200 | consumed samples: 2867200 | consumed tokens: 5872025600 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765789E+00 | loss scale: 4096.0 | grad norm: 5.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.924 | TFLOPs: 42.35 | +[default7]: iteration 2801/ 6200 | consumed samples: 2868224 | consumed tokens: 5874122752 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.787128E+00 | loss scale: 4096.0 | grad norm: 5.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.762 | TFLOPs: 42.30 | +[default7]: iteration 2802/ 6200 | consumed samples: 2869248 | consumed tokens: 5876219904 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.781853E+00 | loss scale: 4096.0 | grad norm: 5.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.591 | TFLOPs: 42.24 | +[default7]: iteration 2803/ 6200 | consumed samples: 2870272 | consumed tokens: 5878317056 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.774171E+00 | loss scale: 4096.0 | grad norm: 6.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.895 | TFLOPs: 42.34 | +[default7]: iteration 2804/ 6200 | consumed samples: 2871296 | consumed tokens: 5880414208 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.766615E+00 | loss scale: 4096.0 | grad norm: 5.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.880 | TFLOPs: 42.33 | +[default7]: iteration 2805/ 6200 | consumed samples: 2872320 | consumed tokens: 5882511360 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770273E+00 | loss scale: 4096.0 | grad norm: 5.023 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.926 | TFLOPs: 42.35 | +[default7]: iteration 2806/ 6200 | consumed samples: 2873344 | consumed tokens: 5884608512 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795091E+00 | loss scale: 4096.0 | grad norm: 5.803 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.976 | TFLOPs: 42.36 | +[default7]: iteration 2807/ 6200 | consumed samples: 2874368 | consumed tokens: 5886705664 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.788737E+00 | loss scale: 4096.0 | grad norm: 6.018 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.557 | TFLOPs: 42.23 | +[default7]: iteration 2808/ 6200 | consumed samples: 2875392 | consumed tokens: 5888802816 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.786429E+00 | loss scale: 4096.0 | grad norm: 6.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.666 | TFLOPs: 42.27 | +[default7]: iteration 2809/ 6200 | consumed samples: 2876416 | consumed tokens: 5890899968 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.767494E+00 | loss scale: 4096.0 | grad norm: 5.793 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.022 | TFLOPs: 42.38 | +[default7]: iteration 2810/ 6200 | consumed samples: 2877440 | consumed tokens: 5892997120 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770229E+00 | loss scale: 4096.0 | grad norm: 6.036 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.897 | TFLOPs: 42.34 | +[default7]: iteration 2811/ 6200 | consumed samples: 2878464 | consumed tokens: 5895094272 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.794848E+00 | loss scale: 4096.0 | grad norm: 6.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.558 | TFLOPs: 42.23 | +[default7]: iteration 2812/ 6200 | consumed samples: 2879488 | consumed tokens: 5897191424 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746457E+00 | loss scale: 4096.0 | grad norm: 7.060 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.687 | TFLOPs: 42.27 | +[default7]: iteration 2813/ 6200 | consumed samples: 2880512 | consumed tokens: 5899288576 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.760584E+00 | loss scale: 4096.0 | grad norm: 5.644 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.410 | TFLOPs: 42.19 | +[default7]: iteration 2814/ 6200 | consumed samples: 2881536 | consumed tokens: 5901385728 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.803428E+00 | loss scale: 4096.0 | grad norm: 5.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.360 | TFLOPs: 42.17 | +[default7]: iteration 2815/ 6200 | consumed samples: 2882560 | consumed tokens: 5903482880 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.774980E+00 | loss scale: 4096.0 | grad norm: 6.038 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.273 | TFLOPs: 42.15 | +[default7]: iteration 2816/ 6200 | consumed samples: 2883584 | consumed tokens: 5905580032 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.767865E+00 | loss scale: 4096.0 | grad norm: 6.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.458 | TFLOPs: 42.20 | +[default7]: iteration 2817/ 6200 | consumed samples: 2884608 | consumed tokens: 5907677184 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.767499E+00 | loss scale: 4096.0 | grad norm: 7.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.230 | TFLOPs: 42.13 | +[default7]: iteration 2818/ 6200 | consumed samples: 2885632 | consumed tokens: 5909774336 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.771642E+00 | loss scale: 4096.0 | grad norm: 6.005 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.072 | TFLOPs: 42.09 | +[default7]: iteration 2819/ 6200 | consumed samples: 2886656 | consumed tokens: 5911871488 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775819E+00 | loss scale: 4096.0 | grad norm: 7.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.362 | TFLOPs: 42.17 | +[default7]: iteration 2820/ 6200 | consumed samples: 2887680 | consumed tokens: 5913968640 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753405E+00 | loss scale: 4096.0 | grad norm: 6.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.349 | TFLOPs: 42.17 | +[default7]: iteration 2821/ 6200 | consumed samples: 2888704 | consumed tokens: 5916065792 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.789592E+00 | loss scale: 4096.0 | grad norm: 7.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.039 | TFLOPs: 42.08 | +[default7]: iteration 2822/ 6200 | consumed samples: 2889728 | consumed tokens: 5918162944 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.771222E+00 | loss scale: 4096.0 | grad norm: 5.734 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.148 | TFLOPs: 42.11 | +[default7]: iteration 2823/ 6200 | consumed samples: 2890752 | consumed tokens: 5920260096 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799882E+00 | loss scale: 4096.0 | grad norm: 7.008 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.881 | TFLOPs: 42.03 | +[default7]: iteration 2824/ 6200 | consumed samples: 2891776 | consumed tokens: 5922357248 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761070E+00 | loss scale: 4096.0 | grad norm: 6.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.144 | TFLOPs: 42.11 | +[default7]: iteration 2825/ 6200 | consumed samples: 2892800 | consumed tokens: 5924454400 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.769120E+00 | loss scale: 4096.0 | grad norm: 4.734 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.132 | TFLOPs: 42.10 | +[default7]: iteration 2826/ 6200 | consumed samples: 2893824 | consumed tokens: 5926551552 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.763988E+00 | loss scale: 4096.0 | grad norm: 5.823 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.934 | TFLOPs: 42.04 | +[default7]: iteration 2827/ 6200 | consumed samples: 2894848 | consumed tokens: 5928648704 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.783207E+00 | loss scale: 4096.0 | grad norm: 7.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.855 | TFLOPs: 42.32 | +[default7]: iteration 2828/ 6200 | consumed samples: 2895872 | consumed tokens: 5930745856 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.768702E+00 | loss scale: 4096.0 | grad norm: 6.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.140 | TFLOPs: 42.41 | +[default7]: iteration 2829/ 6200 | consumed samples: 2896896 | consumed tokens: 5932843008 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.780828E+00 | loss scale: 4096.0 | grad norm: 5.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.956 | TFLOPs: 42.35 | +[default7]: iteration 2830/ 6200 | consumed samples: 2897920 | consumed tokens: 5934940160 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772824E+00 | loss scale: 4096.0 | grad norm: 5.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.813 | TFLOPs: 42.31 | +[default7]: iteration 2831/ 6200 | consumed samples: 2898944 | consumed tokens: 5937037312 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.806659E+00 | loss scale: 4096.0 | grad norm: 5.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.055 | TFLOPs: 42.39 | +[default7]: iteration 2832/ 6200 | consumed samples: 2899968 | consumed tokens: 5939134464 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747777E+00 | loss scale: 4096.0 | grad norm: 6.848 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.091 | TFLOPs: 42.09 | +[default7]: iteration 2833/ 6200 | consumed samples: 2900992 | consumed tokens: 5941231616 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.751379E+00 | loss scale: 4096.0 | grad norm: 5.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.183 | TFLOPs: 42.12 | +[default7]: iteration 2834/ 6200 | consumed samples: 2902016 | consumed tokens: 5943328768 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.755375E+00 | loss scale: 4096.0 | grad norm: 5.867 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.337 | TFLOPs: 42.17 | +[default7]: iteration 2835/ 6200 | consumed samples: 2903040 | consumed tokens: 5945425920 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778360E+00 | loss scale: 4096.0 | grad norm: 6.047 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.297 | TFLOPs: 42.15 | +[default7]: iteration 2836/ 6200 | consumed samples: 2904064 | consumed tokens: 5947523072 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.785634E+00 | loss scale: 4096.0 | grad norm: 5.673 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.430 | TFLOPs: 42.19 | +[default7]: iteration 2837/ 6200 | consumed samples: 2905088 | consumed tokens: 5949620224 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.802602E+00 | loss scale: 4096.0 | grad norm: 5.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.189 | TFLOPs: 42.12 | +[default7]: iteration 2838/ 6200 | consumed samples: 2906112 | consumed tokens: 5951717376 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.788549E+00 | loss scale: 4096.0 | grad norm: 5.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.276 | TFLOPs: 42.15 | +[default7]: iteration 2839/ 6200 | consumed samples: 2907136 | consumed tokens: 5953814528 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.777668E+00 | loss scale: 4096.0 | grad norm: 5.555 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.070 | TFLOPs: 42.08 | +[default7]: iteration 2840/ 6200 | consumed samples: 2908160 | consumed tokens: 5955911680 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782059E+00 | loss scale: 4096.0 | grad norm: 5.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.727 | TFLOPs: 42.29 | +[default7]: iteration 2841/ 6200 | consumed samples: 2909184 | consumed tokens: 5958008832 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761878E+00 | loss scale: 4096.0 | grad norm: 5.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.807 | TFLOPs: 42.31 | +[default7]: iteration 2842/ 6200 | consumed samples: 2910208 | consumed tokens: 5960105984 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749989E+00 | loss scale: 4096.0 | grad norm: 5.952 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.921 | TFLOPs: 42.34 | +[default7]: iteration 2843/ 6200 | consumed samples: 2911232 | consumed tokens: 5962203136 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.771413E+00 | loss scale: 4096.0 | grad norm: 5.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.865 | TFLOPs: 42.33 | +[default7]: iteration 2844/ 6200 | consumed samples: 2912256 | consumed tokens: 5964300288 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772597E+00 | loss scale: 4096.0 | grad norm: 6.648 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.990 | TFLOPs: 42.37 | +[default7]: iteration 2845/ 6200 | consumed samples: 2913280 | consumed tokens: 5966397440 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761047E+00 | loss scale: 4096.0 | grad norm: 6.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.670 | TFLOPs: 42.27 | +[default7]: iteration 2846/ 6200 | consumed samples: 2914304 | consumed tokens: 5968494592 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.804760E+00 | loss scale: 4096.0 | grad norm: 5.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.179 | TFLOPs: 42.42 | +[default7]: iteration 2847/ 6200 | consumed samples: 2915328 | consumed tokens: 5970591744 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.798322E+00 | loss scale: 4096.0 | grad norm: 6.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.999 | TFLOPs: 42.37 | +[default7]: iteration 2848/ 6200 | consumed samples: 2916352 | consumed tokens: 5972688896 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770095E+00 | loss scale: 4096.0 | grad norm: 4.934 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.019 | TFLOPs: 42.37 | +[default7]: iteration 2849/ 6200 | consumed samples: 2917376 | consumed tokens: 5974786048 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.792816E+00 | loss scale: 4096.0 | grad norm: 5.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.143 | TFLOPs: 42.41 | +[default7]: iteration 2850/ 6200 | consumed samples: 2918400 | consumed tokens: 5976883200 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778692E+00 | loss scale: 4096.0 | grad norm: 5.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.390 | TFLOPs: 42.49 | +[default7]: iteration 2851/ 6200 | consumed samples: 2919424 | consumed tokens: 5978980352 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775159E+00 | loss scale: 4096.0 | grad norm: 5.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.279 | TFLOPs: 42.45 | +[default7]: iteration 2852/ 6200 | consumed samples: 2920448 | consumed tokens: 5981077504 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749228E+00 | loss scale: 4096.0 | grad norm: 5.818 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.269 | TFLOPs: 42.45 | +[default7]: iteration 2853/ 6200 | consumed samples: 2921472 | consumed tokens: 5983174656 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.736228E+00 | loss scale: 4096.0 | grad norm: 5.077 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.254 | TFLOPs: 42.45 | +[default7]: iteration 2854/ 6200 | consumed samples: 2922496 | consumed tokens: 5985271808 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.788342E+00 | loss scale: 4096.0 | grad norm: 6.804 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.269 | TFLOPs: 42.45 | +[default7]: iteration 2855/ 6200 | consumed samples: 2923520 | consumed tokens: 5987368960 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.768647E+00 | loss scale: 4096.0 | grad norm: 7.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.080 | TFLOPs: 42.39 | +[default7]: iteration 2856/ 6200 | consumed samples: 2924544 | consumed tokens: 5989466112 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.769391E+00 | loss scale: 4096.0 | grad norm: 6.712 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.221 | TFLOPs: 42.44 | +[default7]: iteration 2857/ 6200 | consumed samples: 2925568 | consumed tokens: 5991563264 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.797976E+00 | loss scale: 4096.0 | grad norm: 6.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.277 | TFLOPs: 42.45 | +[default7]: iteration 2858/ 6200 | consumed samples: 2926592 | consumed tokens: 5993660416 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.787940E+00 | loss scale: 4096.0 | grad norm: 6.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.157 | TFLOPs: 42.42 | +[default7]: iteration 2859/ 6200 | consumed samples: 2927616 | consumed tokens: 5995757568 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.757114E+00 | loss scale: 4096.0 | grad norm: 6.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.337 | TFLOPs: 42.47 | +[default7]: iteration 2860/ 6200 | consumed samples: 2928640 | consumed tokens: 5997854720 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750395E+00 | loss scale: 4096.0 | grad norm: 5.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.237 | TFLOPs: 42.44 | +[default7]: iteration 2861/ 6200 | consumed samples: 2929664 | consumed tokens: 5999951872 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.781829E+00 | loss scale: 4096.0 | grad norm: 4.923 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.243 | TFLOPs: 42.44 | +[default7]: iteration 2862/ 6200 | consumed samples: 2930688 | consumed tokens: 6002049024 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778378E+00 | loss scale: 4096.0 | grad norm: 6.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.087 | TFLOPs: 42.39 | +[default7]: iteration 2863/ 6200 | consumed samples: 2931712 | consumed tokens: 6004146176 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.781672E+00 | loss scale: 4096.0 | grad norm: 7.828 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.085 | TFLOPs: 42.39 | +[default7]: iteration 2864/ 6200 | consumed samples: 2932736 | consumed tokens: 6006243328 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.780011E+00 | loss scale: 4096.0 | grad norm: 7.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.251 | TFLOPs: 42.44 | +[default7]: iteration 2865/ 6200 | consumed samples: 2933760 | consumed tokens: 6008340480 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.773801E+00 | loss scale: 4096.0 | grad norm: 6.968 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.976 | TFLOPs: 42.36 | +[default7]: iteration 2866/ 6200 | consumed samples: 2934784 | consumed tokens: 6010437632 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.796389E+00 | loss scale: 4096.0 | grad norm: 6.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.340 | TFLOPs: 42.47 | +[default7]: iteration 2867/ 6200 | consumed samples: 2935808 | consumed tokens: 6012534784 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.780705E+00 | loss scale: 4096.0 | grad norm: 7.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.199 | TFLOPs: 42.43 | +[default7]: iteration 2868/ 6200 | consumed samples: 2936832 | consumed tokens: 6014631936 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793048E+00 | loss scale: 4096.0 | grad norm: 6.961 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.072 | TFLOPs: 42.39 | +[default7]: iteration 2869/ 6200 | consumed samples: 2937856 | consumed tokens: 6016729088 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756770E+00 | loss scale: 4096.0 | grad norm: 6.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.222 | TFLOPs: 42.44 | +[default7]: iteration 2870/ 6200 | consumed samples: 2938880 | consumed tokens: 6018826240 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750211E+00 | loss scale: 4096.0 | grad norm: 6.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.063 | TFLOPs: 42.39 | +[default7]: iteration 2871/ 6200 | consumed samples: 2939904 | consumed tokens: 6020923392 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.759268E+00 | loss scale: 4096.0 | grad norm: 8.902 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.662 | TFLOPs: 42.27 | +[default7]: iteration 2872/ 6200 | consumed samples: 2940928 | consumed tokens: 6023020544 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.768219E+00 | loss scale: 4096.0 | grad norm: 6.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.822 | TFLOPs: 42.31 | +[default7]: iteration 2873/ 6200 | consumed samples: 2941952 | consumed tokens: 6025117696 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.768732E+00 | loss scale: 4096.0 | grad norm: 5.021 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.125 | TFLOPs: 42.41 | +[default7]: iteration 2874/ 6200 | consumed samples: 2942976 | consumed tokens: 6027214848 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.783768E+00 | loss scale: 4096.0 | grad norm: 5.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.083 | TFLOPs: 42.39 | +[default7]: iteration 2875/ 6200 | consumed samples: 2944000 | consumed tokens: 6029312000 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743254E+00 | loss scale: 4096.0 | grad norm: 6.077 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.216 | TFLOPs: 42.43 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 2875 | lm loss value: 3.533631E+00 | lm loss PPL: 3.424809E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 2875 | lm loss value: 1.662384E+00 | lm loss PPL: 5.271865E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 2876/ 6200 | consumed samples: 2945024 | consumed tokens: 6031409152 | elapsed time per iteration (s): 51.82 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.788310E+00 | loss scale: 4096.0 | grad norm: 6.728 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.760 | TFLOPs: 6.02 | +[default7]: iteration 2877/ 6200 | consumed samples: 2946048 | consumed tokens: 6033506304 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.768084E+00 | loss scale: 4096.0 | grad norm: 6.840 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.028 | TFLOPs: 42.38 | +[default7]: iteration 2878/ 6200 | consumed samples: 2947072 | consumed tokens: 6035603456 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.792096E+00 | loss scale: 4096.0 | grad norm: 6.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.982 | TFLOPs: 42.36 | +[default7]: iteration 2879/ 6200 | consumed samples: 2948096 | consumed tokens: 6037700608 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782877E+00 | loss scale: 4096.0 | grad norm: 6.829 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.885 | TFLOPs: 42.33 | +[default7]: iteration 2880/ 6200 | consumed samples: 2949120 | consumed tokens: 6039797760 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772399E+00 | loss scale: 4096.0 | grad norm: 5.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.314 | TFLOPs: 42.46 | +[default7]: iteration 2881/ 6200 | consumed samples: 2950144 | consumed tokens: 6041894912 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782687E+00 | loss scale: 4096.0 | grad norm: 5.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.014 | TFLOPs: 42.37 | +[default7]: iteration 2882/ 6200 | consumed samples: 2951168 | consumed tokens: 6043992064 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.757378E+00 | loss scale: 4096.0 | grad norm: 8.067 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.997 | TFLOPs: 42.37 | +[default7]: iteration 2883/ 6200 | consumed samples: 2952192 | consumed tokens: 6046089216 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743058E+00 | loss scale: 4096.0 | grad norm: 7.743 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.084 | TFLOPs: 42.39 | +[default7]: iteration 2884/ 6200 | consumed samples: 2953216 | consumed tokens: 6048186368 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750033E+00 | loss scale: 4096.0 | grad norm: 5.953 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.845 | TFLOPs: 42.32 | +[default7]: iteration 2885/ 6200 | consumed samples: 2954240 | consumed tokens: 6050283520 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756765E+00 | loss scale: 4096.0 | grad norm: 5.737 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.239 | TFLOPs: 42.44 | +[default7]: iteration 2886/ 6200 | consumed samples: 2955264 | consumed tokens: 6052380672 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.760010E+00 | loss scale: 4096.0 | grad norm: 7.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.992 | TFLOPs: 42.37 | +[default7]: iteration 2887/ 6200 | consumed samples: 2956288 | consumed tokens: 6054477824 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778556E+00 | loss scale: 4096.0 | grad norm: 6.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.311 | TFLOPs: 42.46 | +[default7]: iteration 2888/ 6200 | consumed samples: 2957312 | consumed tokens: 6056574976 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793911E+00 | loss scale: 4096.0 | grad norm: 6.968 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.000 | TFLOPs: 42.37 | +[default7]: iteration 2889/ 6200 | consumed samples: 2958336 | consumed tokens: 6058672128 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735785E+00 | loss scale: 4096.0 | grad norm: 5.840 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.128 | TFLOPs: 42.41 | +[default7]: iteration 2890/ 6200 | consumed samples: 2959360 | consumed tokens: 6060769280 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.760796E+00 | loss scale: 4096.0 | grad norm: 8.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.010 | TFLOPs: 42.37 | +[default7]: iteration 2891/ 6200 | consumed samples: 2960384 | consumed tokens: 6062866432 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770969E+00 | loss scale: 4096.0 | grad norm: 6.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.247 | TFLOPs: 42.44 | +[default7]: iteration 2892/ 6200 | consumed samples: 2961408 | consumed tokens: 6064963584 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.762992E+00 | loss scale: 4096.0 | grad norm: 5.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.278 | TFLOPs: 42.45 | +[default7]: iteration 2893/ 6200 | consumed samples: 2962432 | consumed tokens: 6067060736 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782363E+00 | loss scale: 4096.0 | grad norm: 6.796 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.368 | TFLOPs: 42.48 | +[default7]: iteration 2894/ 6200 | consumed samples: 2963456 | consumed tokens: 6069157888 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.797392E+00 | loss scale: 4096.0 | grad norm: 5.799 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.210 | TFLOPs: 42.43 | +[default7]: iteration 2895/ 6200 | consumed samples: 2964480 | consumed tokens: 6071255040 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.779055E+00 | loss scale: 4096.0 | grad norm: 6.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.256 | TFLOPs: 42.45 | +[default7]: iteration 2896/ 6200 | consumed samples: 2965504 | consumed tokens: 6073352192 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.751382E+00 | loss scale: 4096.0 | grad norm: 5.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.138 | TFLOPs: 42.41 | +[default7]: iteration 2897/ 6200 | consumed samples: 2966528 | consumed tokens: 6075449344 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765764E+00 | loss scale: 4096.0 | grad norm: 6.012 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.136 | TFLOPs: 42.41 | +[default7]: iteration 2898/ 6200 | consumed samples: 2967552 | consumed tokens: 6077546496 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782034E+00 | loss scale: 4096.0 | grad norm: 7.900 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.218 | TFLOPs: 42.43 | +[default7]: iteration 2899/ 6200 | consumed samples: 2968576 | consumed tokens: 6079643648 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.768209E+00 | loss scale: 4096.0 | grad norm: 6.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.249 | TFLOPs: 42.44 | +[default7]: iteration 2900/ 6200 | consumed samples: 2969600 | consumed tokens: 6081740800 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748197E+00 | loss scale: 4096.0 | grad norm: 6.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.225 | TFLOPs: 42.44 | +[default7]: iteration 2901/ 6200 | consumed samples: 2970624 | consumed tokens: 6083837952 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.798596E+00 | loss scale: 4096.0 | grad norm: 6.622 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.086 | TFLOPs: 42.39 | +[default7]: iteration 2902/ 6200 | consumed samples: 2971648 | consumed tokens: 6085935104 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772930E+00 | loss scale: 4096.0 | grad norm: 7.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.255 | TFLOPs: 42.45 | +[default7]: iteration 2903/ 6200 | consumed samples: 2972672 | consumed tokens: 6088032256 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770916E+00 | loss scale: 4096.0 | grad norm: 6.043 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.033 | TFLOPs: 42.38 | +[default7]: iteration 2904/ 6200 | consumed samples: 2973696 | consumed tokens: 6090129408 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775150E+00 | loss scale: 4096.0 | grad norm: 6.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.181 | TFLOPs: 42.42 | +[default7]: iteration 2905/ 6200 | consumed samples: 2974720 | consumed tokens: 6092226560 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775370E+00 | loss scale: 4096.0 | grad norm: 7.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.112 | TFLOPs: 42.40 | +[default7]: iteration 2906/ 6200 | consumed samples: 2975744 | consumed tokens: 6094323712 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.757618E+00 | loss scale: 4096.0 | grad norm: 6.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.148 | TFLOPs: 42.41 | +[default7]: iteration 2907/ 6200 | consumed samples: 2976768 | consumed tokens: 6096420864 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.745676E+00 | loss scale: 4096.0 | grad norm: 5.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.138 | TFLOPs: 42.41 | +[default7]: iteration 2908/ 6200 | consumed samples: 2977792 | consumed tokens: 6098518016 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.774386E+00 | loss scale: 4096.0 | grad norm: 7.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.967 | TFLOPs: 42.36 | +[default7]: iteration 2909/ 6200 | consumed samples: 2978816 | consumed tokens: 6100615168 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761054E+00 | loss scale: 4096.0 | grad norm: 7.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.096 | TFLOPs: 42.40 | +[default7]: iteration 2910/ 6200 | consumed samples: 2979840 | consumed tokens: 6102712320 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.764459E+00 | loss scale: 4096.0 | grad norm: 7.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.308 | TFLOPs: 42.46 | +[default7]: iteration 2911/ 6200 | consumed samples: 2980864 | consumed tokens: 6104809472 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.771607E+00 | loss scale: 4096.0 | grad norm: 6.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.263 | TFLOPs: 42.45 | +[default7]: iteration 2912/ 6200 | consumed samples: 2981888 | consumed tokens: 6106906624 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756083E+00 | loss scale: 4096.0 | grad norm: 6.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.150 | TFLOPs: 42.41 | +[default7]: iteration 2913/ 6200 | consumed samples: 2982912 | consumed tokens: 6109003776 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772076E+00 | loss scale: 4096.0 | grad norm: 7.799 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.234 | TFLOPs: 42.44 | +[default7]: iteration 2914/ 6200 | consumed samples: 2983936 | consumed tokens: 6111100928 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775269E+00 | loss scale: 4096.0 | grad norm: 8.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.229 | TFLOPs: 42.44 | +[default7]: iteration 2915/ 6200 | consumed samples: 2984960 | consumed tokens: 6113198080 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.764978E+00 | loss scale: 4096.0 | grad norm: 6.008 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.388 | TFLOPs: 42.49 | +[default7]: iteration 2916/ 6200 | consumed samples: 2985984 | consumed tokens: 6115295232 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.803506E+00 | loss scale: 4096.0 | grad norm: 6.992 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.209 | TFLOPs: 42.43 | +[default7]: iteration 2917/ 6200 | consumed samples: 2987008 | consumed tokens: 6117392384 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772257E+00 | loss scale: 4096.0 | grad norm: 7.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.229 | TFLOPs: 42.44 | +[default7]: iteration 2918/ 6200 | consumed samples: 2988032 | consumed tokens: 6119489536 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.754361E+00 | loss scale: 4096.0 | grad norm: 7.994 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.201 | TFLOPs: 42.43 | +[default7]: iteration 2919/ 6200 | consumed samples: 2989056 | consumed tokens: 6121586688 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749381E+00 | loss scale: 4096.0 | grad norm: 4.946 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.986 | TFLOPs: 42.36 | +[default7]: iteration 2920/ 6200 | consumed samples: 2990080 | consumed tokens: 6123683840 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.768623E+00 | loss scale: 4096.0 | grad norm: 5.763 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.202 | TFLOPs: 42.43 | +[default7]: iteration 2921/ 6200 | consumed samples: 2991104 | consumed tokens: 6125780992 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.800568E+00 | loss scale: 4096.0 | grad norm: 4.985 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.050 | TFLOPs: 42.38 | +[default7]: iteration 2922/ 6200 | consumed samples: 2992128 | consumed tokens: 6127878144 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.792063E+00 | loss scale: 4096.0 | grad norm: 5.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.359 | TFLOPs: 42.48 | +[default7]: iteration 2923/ 6200 | consumed samples: 2993152 | consumed tokens: 6129975296 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.766936E+00 | loss scale: 4096.0 | grad norm: 4.966 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.050 | TFLOPs: 42.38 | +[default7]: iteration 2924/ 6200 | consumed samples: 2994176 | consumed tokens: 6132072448 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.760422E+00 | loss scale: 4096.0 | grad norm: 5.968 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.306 | TFLOPs: 42.46 | +[default7]: iteration 2925/ 6200 | consumed samples: 2995200 | consumed tokens: 6134169600 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.786971E+00 | loss scale: 4096.0 | grad norm: 4.913 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.344 | TFLOPs: 42.47 | +[default7]: iteration 2926/ 6200 | consumed samples: 2996224 | consumed tokens: 6136266752 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.767556E+00 | loss scale: 4096.0 | grad norm: 5.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.853 | TFLOPs: 42.32 | +[default7]: iteration 2927/ 6200 | consumed samples: 2997248 | consumed tokens: 6138363904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.760813E+00 | loss scale: 4096.0 | grad norm: 5.834 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.803 | TFLOPs: 42.31 | +[default7]: iteration 2928/ 6200 | consumed samples: 2998272 | consumed tokens: 6140461056 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.754024E+00 | loss scale: 4096.0 | grad norm: 6.072 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.887 | TFLOPs: 42.33 | +[default7]: iteration 2929/ 6200 | consumed samples: 2999296 | consumed tokens: 6142558208 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.776206E+00 | loss scale: 4096.0 | grad norm: 5.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.485 | TFLOPs: 42.21 | +[default7]: iteration 2930/ 6200 | consumed samples: 3000320 | consumed tokens: 6144655360 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.766690E+00 | loss scale: 4096.0 | grad norm: 5.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.372 | TFLOPs: 42.48 | +[default7]: iteration 2931/ 6200 | consumed samples: 3001344 | consumed tokens: 6146752512 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.776672E+00 | loss scale: 4096.0 | grad norm: 6.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.327 | TFLOPs: 42.47 | +[default7]: iteration 2932/ 6200 | consumed samples: 3002368 | consumed tokens: 6148849664 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749030E+00 | loss scale: 4096.0 | grad norm: 5.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.257 | TFLOPs: 42.45 | +[default7]: iteration 2933/ 6200 | consumed samples: 3003392 | consumed tokens: 6150946816 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772917E+00 | loss scale: 4096.0 | grad norm: 5.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.330 | TFLOPs: 42.47 | +[default7]: iteration 2934/ 6200 | consumed samples: 3004416 | consumed tokens: 6153043968 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.764884E+00 | loss scale: 4096.0 | grad norm: 5.620 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.246 | TFLOPs: 42.44 | +[default7]: iteration 2935/ 6200 | consumed samples: 3005440 | consumed tokens: 6155141120 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782804E+00 | loss scale: 4096.0 | grad norm: 6.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.144 | TFLOPs: 42.41 | +[default7]: iteration 2936/ 6200 | consumed samples: 3006464 | consumed tokens: 6157238272 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.758603E+00 | loss scale: 4096.0 | grad norm: 5.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.086 | TFLOPs: 42.39 | +[default7]: iteration 2937/ 6200 | consumed samples: 3007488 | consumed tokens: 6159335424 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.787967E+00 | loss scale: 4096.0 | grad norm: 5.928 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.311 | TFLOPs: 42.46 | +[default7]: iteration 2938/ 6200 | consumed samples: 3008512 | consumed tokens: 6161432576 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.757663E+00 | loss scale: 4096.0 | grad norm: 8.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.251 | TFLOPs: 42.44 | +[default7]: iteration 2939/ 6200 | consumed samples: 3009536 | consumed tokens: 6163529728 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.773573E+00 | loss scale: 4096.0 | grad norm: 5.827 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.371 | TFLOPs: 42.48 | +[default7]: iteration 2940/ 6200 | consumed samples: 3010560 | consumed tokens: 6165626880 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741081E+00 | loss scale: 4096.0 | grad norm: 5.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.338 | TFLOPs: 42.47 | +[default7]: iteration 2941/ 6200 | consumed samples: 3011584 | consumed tokens: 6167724032 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.808268E+00 | loss scale: 4096.0 | grad norm: 5.606 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.473 | TFLOPs: 42.51 | +[default7]: iteration 2942/ 6200 | consumed samples: 3012608 | consumed tokens: 6169821184 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.768336E+00 | loss scale: 4096.0 | grad norm: 7.528 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.283 | TFLOPs: 42.45 | +[default7]: iteration 2943/ 6200 | consumed samples: 3013632 | consumed tokens: 6171918336 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.758296E+00 | loss scale: 4096.0 | grad norm: 6.946 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.316 | TFLOPs: 42.46 | +[default7]: iteration 2944/ 6200 | consumed samples: 3014656 | consumed tokens: 6174015488 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.774742E+00 | loss scale: 4096.0 | grad norm: 5.890 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.960 | TFLOPs: 42.36 | +[default7]: iteration 2945/ 6200 | consumed samples: 3015680 | consumed tokens: 6176112640 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778965E+00 | loss scale: 4096.0 | grad norm: 6.049 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.054 | TFLOPs: 42.38 | +[default7]: iteration 2946/ 6200 | consumed samples: 3016704 | consumed tokens: 6178209792 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.774342E+00 | loss scale: 4096.0 | grad norm: 6.063 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.869 | TFLOPs: 42.33 | +[default7]: iteration 2947/ 6200 | consumed samples: 3017728 | consumed tokens: 6180306944 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761244E+00 | loss scale: 4096.0 | grad norm: 5.799 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.960 | TFLOPs: 42.36 | +[default7]: iteration 2948/ 6200 | consumed samples: 3018752 | consumed tokens: 6182404096 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.762505E+00 | loss scale: 4096.0 | grad norm: 5.928 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.816 | TFLOPs: 42.31 | +[default7]: iteration 2949/ 6200 | consumed samples: 3019776 | consumed tokens: 6184501248 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756676E+00 | loss scale: 4096.0 | grad norm: 5.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.984 | TFLOPs: 42.36 | +[default7]: iteration 2950/ 6200 | consumed samples: 3020800 | consumed tokens: 6186598400 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.768092E+00 | loss scale: 4096.0 | grad norm: 4.997 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.254 | TFLOPs: 42.45 | +[default7]: iteration 2951/ 6200 | consumed samples: 3021824 | consumed tokens: 6188695552 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775835E+00 | loss scale: 4096.0 | grad norm: 5.731 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.942 | TFLOPs: 42.35 | +[default7]: iteration 2952/ 6200 | consumed samples: 3022848 | consumed tokens: 6190792704 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.771510E+00 | loss scale: 4096.0 | grad norm: 5.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.190 | TFLOPs: 42.43 | +[default7]: iteration 2953/ 6200 | consumed samples: 3023872 | consumed tokens: 6192889856 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.793679E+00 | loss scale: 4096.0 | grad norm: 5.895 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.211 | TFLOPs: 42.43 | +[default7]: iteration 2954/ 6200 | consumed samples: 3024896 | consumed tokens: 6194987008 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.777118E+00 | loss scale: 4096.0 | grad norm: 5.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.169 | TFLOPs: 42.42 | +[default7]: iteration 2955/ 6200 | consumed samples: 3025920 | consumed tokens: 6197084160 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.776520E+00 | loss scale: 4096.0 | grad norm: 6.739 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.095 | TFLOPs: 42.40 | +[default7]: iteration 2956/ 6200 | consumed samples: 3026944 | consumed tokens: 6199181312 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.755598E+00 | loss scale: 4096.0 | grad norm: 5.569 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.898 | TFLOPs: 42.34 | +[default7]: iteration 2957/ 6200 | consumed samples: 3027968 | consumed tokens: 6201278464 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.758221E+00 | loss scale: 4096.0 | grad norm: 5.893 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.299 | TFLOPs: 42.46 | +[default7]: iteration 2958/ 6200 | consumed samples: 3028992 | consumed tokens: 6203375616 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.755968E+00 | loss scale: 4096.0 | grad norm: 6.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.105 | TFLOPs: 42.40 | +[default7]: iteration 2959/ 6200 | consumed samples: 3030016 | consumed tokens: 6205472768 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.762423E+00 | loss scale: 4096.0 | grad norm: 4.993 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.091 | TFLOPs: 42.40 | +[default7]: iteration 2960/ 6200 | consumed samples: 3031040 | consumed tokens: 6207569920 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750035E+00 | loss scale: 4096.0 | grad norm: 5.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.267 | TFLOPs: 42.45 | +[default7]: iteration 2961/ 6200 | consumed samples: 3032064 | consumed tokens: 6209667072 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.788240E+00 | loss scale: 4096.0 | grad norm: 5.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.450 | TFLOPs: 42.51 | +[default7]: iteration 2962/ 6200 | consumed samples: 3033088 | consumed tokens: 6211764224 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778558E+00 | loss scale: 4096.0 | grad norm: 5.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.425 | TFLOPs: 42.50 | +[default7]: iteration 2963/ 6200 | consumed samples: 3034112 | consumed tokens: 6213861376 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.757071E+00 | loss scale: 4096.0 | grad norm: 5.814 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.334 | TFLOPs: 42.47 | +[default7]: iteration 2964/ 6200 | consumed samples: 3035136 | consumed tokens: 6215958528 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.763052E+00 | loss scale: 4096.0 | grad norm: 5.057 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.437 | TFLOPs: 42.50 | +[default7]: iteration 2965/ 6200 | consumed samples: 3036160 | consumed tokens: 6218055680 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.818291E+00 | loss scale: 4096.0 | grad norm: 4.823 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.520 | TFLOPs: 42.53 | +[default7]: iteration 2966/ 6200 | consumed samples: 3037184 | consumed tokens: 6220152832 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746954E+00 | loss scale: 4096.0 | grad norm: 5.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.562 | TFLOPs: 42.54 | +[default7]: iteration 2967/ 6200 | consumed samples: 3038208 | consumed tokens: 6222249984 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.759873E+00 | loss scale: 4096.0 | grad norm: 6.961 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.396 | TFLOPs: 42.49 | +[default7]: iteration 2968/ 6200 | consumed samples: 3039232 | consumed tokens: 6224347136 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778847E+00 | loss scale: 4096.0 | grad norm: 6.742 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.129 | TFLOPs: 42.41 | +[default7]: iteration 2969/ 6200 | consumed samples: 3040256 | consumed tokens: 6226444288 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741679E+00 | loss scale: 4096.0 | grad norm: 5.709 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.079 | TFLOPs: 42.39 | +[default7]: iteration 2970/ 6200 | consumed samples: 3041280 | consumed tokens: 6228541440 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.797229E+00 | loss scale: 4096.0 | grad norm: 5.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.703 | TFLOPs: 42.28 | +[default7]: iteration 2971/ 6200 | consumed samples: 3042304 | consumed tokens: 6230638592 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746113E+00 | loss scale: 4096.0 | grad norm: 6.763 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.050 | TFLOPs: 42.38 | +[default7]: iteration 2972/ 6200 | consumed samples: 3043328 | consumed tokens: 6232735744 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.773297E+00 | loss scale: 4096.0 | grad norm: 5.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.177 | TFLOPs: 42.42 | +[default7]: iteration 2973/ 6200 | consumed samples: 3044352 | consumed tokens: 6234832896 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782172E+00 | loss scale: 4096.0 | grad norm: 4.911 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.126 | TFLOPs: 42.41 | +[default7]: iteration 2974/ 6200 | consumed samples: 3045376 | consumed tokens: 6236930048 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.768061E+00 | loss scale: 4096.0 | grad norm: 5.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.147 | TFLOPs: 42.41 | +[default7]: iteration 2975/ 6200 | consumed samples: 3046400 | consumed tokens: 6239027200 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.729455E+00 | loss scale: 4096.0 | grad norm: 5.619 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.513 | TFLOPs: 42.52 | +[default7]: iteration 2976/ 6200 | consumed samples: 3047424 | consumed tokens: 6241124352 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756582E+00 | loss scale: 4096.0 | grad norm: 5.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.472 | TFLOPs: 42.51 | +[default7]: iteration 2977/ 6200 | consumed samples: 3048448 | consumed tokens: 6243221504 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.781244E+00 | loss scale: 4096.0 | grad norm: 5.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.293 | TFLOPs: 42.46 | +[default7]: iteration 2978/ 6200 | consumed samples: 3049472 | consumed tokens: 6245318656 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.758126E+00 | loss scale: 4096.0 | grad norm: 5.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.274 | TFLOPs: 42.45 | +[default7]: iteration 2979/ 6200 | consumed samples: 3050496 | consumed tokens: 6247415808 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.764933E+00 | loss scale: 4096.0 | grad norm: 6.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.956 | TFLOPs: 42.35 | +[default7]: iteration 2980/ 6200 | consumed samples: 3051520 | consumed tokens: 6249512960 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.771810E+00 | loss scale: 4096.0 | grad norm: 5.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.985 | TFLOPs: 42.36 | +[default7]: iteration 2981/ 6200 | consumed samples: 3052544 | consumed tokens: 6251610112 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.755234E+00 | loss scale: 4096.0 | grad norm: 5.760 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.196 | TFLOPs: 42.43 | +[default7]: iteration 2982/ 6200 | consumed samples: 3053568 | consumed tokens: 6253707264 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.779959E+00 | loss scale: 4096.0 | grad norm: 5.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.191 | TFLOPs: 42.43 | +[default7]: iteration 2983/ 6200 | consumed samples: 3054592 | consumed tokens: 6255804416 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.766050E+00 | loss scale: 4096.0 | grad norm: 5.806 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.327 | TFLOPs: 42.47 | +[default7]: iteration 2984/ 6200 | consumed samples: 3055616 | consumed tokens: 6257901568 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.781663E+00 | loss scale: 4096.0 | grad norm: 5.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.290 | TFLOPs: 42.46 | +[default7]: iteration 2985/ 6200 | consumed samples: 3056640 | consumed tokens: 6259998720 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.780248E+00 | loss scale: 4096.0 | grad norm: 5.661 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.034 | TFLOPs: 42.38 | +[default7]: iteration 2986/ 6200 | consumed samples: 3057664 | consumed tokens: 6262095872 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.757092E+00 | loss scale: 4096.0 | grad norm: 7.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.015 | TFLOPs: 42.37 | +[default7]: iteration 2987/ 6200 | consumed samples: 3058688 | consumed tokens: 6264193024 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.760506E+00 | loss scale: 4096.0 | grad norm: 5.825 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.266 | TFLOPs: 42.45 | +[default7]: iteration 2988/ 6200 | consumed samples: 3059712 | consumed tokens: 6266290176 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.763953E+00 | loss scale: 4096.0 | grad norm: 5.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.079 | TFLOPs: 42.39 | +[default7]: iteration 2989/ 6200 | consumed samples: 3060736 | consumed tokens: 6268387328 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770458E+00 | loss scale: 4096.0 | grad norm: 5.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.360 | TFLOPs: 42.48 | +[default7]: iteration 2990/ 6200 | consumed samples: 3061760 | consumed tokens: 6270484480 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756505E+00 | loss scale: 4096.0 | grad norm: 5.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.820 | TFLOPs: 42.31 | +[default7]: iteration 2991/ 6200 | consumed samples: 3062784 | consumed tokens: 6272581632 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.754537E+00 | loss scale: 4096.0 | grad norm: 4.846 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.323 | TFLOPs: 42.47 | +[default7]: iteration 2992/ 6200 | consumed samples: 3063808 | consumed tokens: 6274678784 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.762414E+00 | loss scale: 4096.0 | grad norm: 5.813 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.069 | TFLOPs: 42.39 | +[default7]: iteration 2993/ 6200 | consumed samples: 3064832 | consumed tokens: 6276775936 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.762891E+00 | loss scale: 4096.0 | grad norm: 6.832 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.018 | TFLOPs: 42.37 | +[default7]: iteration 2994/ 6200 | consumed samples: 3065856 | consumed tokens: 6278873088 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746706E+00 | loss scale: 4096.0 | grad norm: 5.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.227 | TFLOPs: 42.44 | +[default7]: iteration 2995/ 6200 | consumed samples: 3066880 | consumed tokens: 6280970240 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.783204E+00 | loss scale: 4096.0 | grad norm: 6.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.028 | TFLOPs: 42.38 | +[default7]: iteration 2996/ 6200 | consumed samples: 3067904 | consumed tokens: 6283067392 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.745907E+00 | loss scale: 4096.0 | grad norm: 6.652 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.888 | TFLOPs: 42.33 | +[default7]: iteration 2997/ 6200 | consumed samples: 3068928 | consumed tokens: 6285164544 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765222E+00 | loss scale: 4096.0 | grad norm: 5.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.323 | TFLOPs: 42.47 | +[default7]: iteration 2998/ 6200 | consumed samples: 3069952 | consumed tokens: 6287261696 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740680E+00 | loss scale: 4096.0 | grad norm: 5.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.365 | TFLOPs: 42.48 | +[default7]: iteration 2999/ 6200 | consumed samples: 3070976 | consumed tokens: 6289358848 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.773905E+00 | loss scale: 4096.0 | grad norm: 4.972 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.131 | TFLOPs: 42.41 | +[default7]: iteration 3000/ 6200 | consumed samples: 3072000 | consumed tokens: 6291456000 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.767816E+00 | loss scale: 4096.0 | grad norm: 5.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.155 | TFLOPs: 42.42 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 3000 | lm loss value: 3.541085E+00 | lm loss PPL: 3.450434E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 3000 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 19:10:57,074] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3000 is begin to save! +[default0]:[2022-10-06 19:10:57,079] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_01-model_00-model_states.pt... +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 3000 | lm loss value: 1.649724E+00 | lm loss PPL: 5.205545E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:[2022-10-06 19:10:57,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,496] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,527] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,584] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,611] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,718] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,747] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,775] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,830] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,887] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,915] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:57,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:57,998] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:58,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:58,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:58,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:58,055] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:58,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:58,084] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:58,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:58,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:58,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:58,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:58,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:58,170] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:58,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:58,198] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_26-model_00-model_states.pt... +[default7]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 19:10:58,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:58,226] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 19:10:58,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 19:10:58,228] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/mp_rank_00_model_states.pt +[default0]:[2022-10-06 19:10:58,228] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 19:10:58,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 19:10:58,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 19:10:58,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 19:10:58,461] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 19:10:58,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 19:10:58,441] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 19:10:58,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 19:10:58,446] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 19:10:58,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 19:10:58,443] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 19:10:58,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 19:10:58,501] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 19:10:58,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 19:10:58,443] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 19:10:58,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 19:10:58,455] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 19:10:58,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 19:10:58,449] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 19:10:58,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 19:10:58,455] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 19:10:58,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 19:10:58,545] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 19:10:58,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 19:10:58,544] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 19:10:58,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 19:10:58,565] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 19:10:58,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 19:10:58,509] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 19:10:58,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 19:10:58,539] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 19:10:58,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 19:10:58,544] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 19:10:58,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 19:10:58,558] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 19:10:58,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 19:10:58,552] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 19:10:58,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 19:10:58,558] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 19:10:58,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 19:10:58,580] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 19:10:58,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 19:10:58,554] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 19:10:58,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 19:10:58,571] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 19:10:58,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 19:10:58,519] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 19:10:58,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 19:10:58,543] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 19:10:58,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 19:10:58,550] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 19:10:58,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 19:10:58,540] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 19:10:58,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 19:10:58,554] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 19:10:58,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 19:10:58,553] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 19:10:58,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 19:10:58,628] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default7]:time (ms) | save-checkpoint: 1578.43 +[default7]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default3]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default6]:[2022-10-06 19:10:58,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 19:10:58,618] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default2]:[2022-10-06 19:10:58,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 19:10:58,605] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default7]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default1]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default5]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default2]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default3]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default6]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default5]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default4]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default6]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default6]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default2]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default4]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default1]:[2022-10-06 19:10:58,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 19:10:58,619] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default4]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default1]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default1]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]: successfully saved checkpoint at iteration 3000 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default4]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default0]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default5]:[2022-10-06 19:10:58,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 19:10:58,651] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3000/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default3]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default3]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default7]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default5]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default2]:[2022-10-06 19:10:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +[default7]: iteration 3001/ 6200 | consumed samples: 3073024 | consumed tokens: 6293553152 | elapsed time per iteration (s): 53.27 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753372E+00 | loss scale: 4096.0 | grad norm: 5.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.221 | TFLOPs: 5.86 | +[default7]: iteration 3002/ 6200 | consumed samples: 3074048 | consumed tokens: 6295650304 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.751127E+00 | loss scale: 4096.0 | grad norm: 6.045 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.881 | TFLOPs: 42.33 | +[default7]: iteration 3003/ 6200 | consumed samples: 3075072 | consumed tokens: 6297747456 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.760698E+00 | loss scale: 4096.0 | grad norm: 5.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.888 | TFLOPs: 42.33 | +[default7]: iteration 3004/ 6200 | consumed samples: 3076096 | consumed tokens: 6299844608 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778818E+00 | loss scale: 4096.0 | grad norm: 5.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.898 | TFLOPs: 42.34 | +[default7]: iteration 3005/ 6200 | consumed samples: 3077120 | consumed tokens: 6301941760 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.754057E+00 | loss scale: 4096.0 | grad norm: 5.600 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.698 | TFLOPs: 42.28 | +[default7]: iteration 3006/ 6200 | consumed samples: 3078144 | consumed tokens: 6304038912 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.755312E+00 | loss scale: 4096.0 | grad norm: 6.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.873 | TFLOPs: 42.33 | +[default7]: iteration 3007/ 6200 | consumed samples: 3079168 | consumed tokens: 6306136064 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.752529E+00 | loss scale: 4096.0 | grad norm: 6.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.653 | TFLOPs: 42.26 | +[default7]: iteration 3008/ 6200 | consumed samples: 3080192 | consumed tokens: 6308233216 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765349E+00 | loss scale: 4096.0 | grad norm: 6.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.009 | TFLOPs: 42.37 | +[default7]: iteration 3009/ 6200 | consumed samples: 3081216 | consumed tokens: 6310330368 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.763372E+00 | loss scale: 4096.0 | grad norm: 5.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.944 | TFLOPs: 42.35 | +[default7]: iteration 3010/ 6200 | consumed samples: 3082240 | consumed tokens: 6312427520 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.744188E+00 | loss scale: 4096.0 | grad norm: 7.818 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.691 | TFLOPs: 42.27 | +[default7]: iteration 3011/ 6200 | consumed samples: 3083264 | consumed tokens: 6314524672 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.769867E+00 | loss scale: 4096.0 | grad norm: 7.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.837 | TFLOPs: 42.32 | +[default7]: iteration 3012/ 6200 | consumed samples: 3084288 | consumed tokens: 6316621824 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.763986E+00 | loss scale: 4096.0 | grad norm: 7.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.467 | TFLOPs: 42.51 | +[default7]: iteration 3013/ 6200 | consumed samples: 3085312 | consumed tokens: 6318718976 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753731E+00 | loss scale: 4096.0 | grad norm: 7.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.753 | TFLOPs: 42.29 | +[default7]: iteration 3014/ 6200 | consumed samples: 3086336 | consumed tokens: 6320816128 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.771250E+00 | loss scale: 4096.0 | grad norm: 5.905 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.759 | TFLOPs: 42.29 | +[default7]: iteration 3015/ 6200 | consumed samples: 3087360 | consumed tokens: 6322913280 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.786645E+00 | loss scale: 4096.0 | grad norm: 8.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.793 | TFLOPs: 42.31 | +[default7]: iteration 3016/ 6200 | consumed samples: 3088384 | consumed tokens: 6325010432 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.763945E+00 | loss scale: 4096.0 | grad norm: 8.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.883 | TFLOPs: 42.33 | +[default7]: iteration 3017/ 6200 | consumed samples: 3089408 | consumed tokens: 6327107584 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738534E+00 | loss scale: 4096.0 | grad norm: 6.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.062 | TFLOPs: 42.39 | +[default7]: iteration 3018/ 6200 | consumed samples: 3090432 | consumed tokens: 6329204736 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765467E+00 | loss scale: 4096.0 | grad norm: 5.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.095 | TFLOPs: 42.40 | +[default7]: iteration 3019/ 6200 | consumed samples: 3091456 | consumed tokens: 6331301888 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765228E+00 | loss scale: 4096.0 | grad norm: 6.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.892 | TFLOPs: 42.34 | +[default7]: iteration 3020/ 6200 | consumed samples: 3092480 | consumed tokens: 6333399040 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761401E+00 | loss scale: 4096.0 | grad norm: 5.968 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.808 | TFLOPs: 42.31 | +[default7]: iteration 3021/ 6200 | consumed samples: 3093504 | consumed tokens: 6335496192 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.780063E+00 | loss scale: 4096.0 | grad norm: 4.945 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.779 | TFLOPs: 42.30 | +[default7]: iteration 3022/ 6200 | consumed samples: 3094528 | consumed tokens: 6337593344 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.774583E+00 | loss scale: 4096.0 | grad norm: 5.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.864 | TFLOPs: 42.33 | +[default7]: iteration 3023/ 6200 | consumed samples: 3095552 | consumed tokens: 6339690496 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.795546E+00 | loss scale: 4096.0 | grad norm: 6.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.600 | TFLOPs: 42.55 | +[default7]: iteration 3024/ 6200 | consumed samples: 3096576 | consumed tokens: 6341787648 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726219E+00 | loss scale: 4096.0 | grad norm: 4.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.587 | TFLOPs: 42.55 | +[default7]: iteration 3025/ 6200 | consumed samples: 3097600 | consumed tokens: 6343884800 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753241E+00 | loss scale: 4096.0 | grad norm: 4.583 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.524 | TFLOPs: 42.53 | +[default7]: iteration 3026/ 6200 | consumed samples: 3098624 | consumed tokens: 6345981952 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753605E+00 | loss scale: 4096.0 | grad norm: 5.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.422 | TFLOPs: 42.50 | +[default7]: iteration 3027/ 6200 | consumed samples: 3099648 | consumed tokens: 6348079104 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746835E+00 | loss scale: 4096.0 | grad norm: 5.050 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.363 | TFLOPs: 42.48 | +[default7]: iteration 3028/ 6200 | consumed samples: 3100672 | consumed tokens: 6350176256 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.773835E+00 | loss scale: 4096.0 | grad norm: 5.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.602 | TFLOPs: 42.55 | +[default7]: iteration 3029/ 6200 | consumed samples: 3101696 | consumed tokens: 6352273408 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770635E+00 | loss scale: 4096.0 | grad norm: 5.685 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.150 | TFLOPs: 42.41 | +[default7]: iteration 3030/ 6200 | consumed samples: 3102720 | consumed tokens: 6354370560 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743823E+00 | loss scale: 4096.0 | grad norm: 5.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.412 | TFLOPs: 42.49 | +[default7]: iteration 3031/ 6200 | consumed samples: 3103744 | consumed tokens: 6356467712 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743423E+00 | loss scale: 4096.0 | grad norm: 5.901 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.372 | TFLOPs: 42.48 | +[default7]: iteration 3032/ 6200 | consumed samples: 3104768 | consumed tokens: 6358564864 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775508E+00 | loss scale: 4096.0 | grad norm: 6.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.382 | TFLOPs: 42.48 | +[default7]: iteration 3033/ 6200 | consumed samples: 3105792 | consumed tokens: 6360662016 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753174E+00 | loss scale: 4096.0 | grad norm: 5.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.727 | TFLOPs: 42.29 | +[default7]: iteration 3034/ 6200 | consumed samples: 3106816 | consumed tokens: 6362759168 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746474E+00 | loss scale: 4096.0 | grad norm: 6.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.827 | TFLOPs: 42.32 | +[default7]: iteration 3035/ 6200 | consumed samples: 3107840 | consumed tokens: 6364856320 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.754435E+00 | loss scale: 4096.0 | grad norm: 5.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.987 | TFLOPs: 42.36 | +[default7]: iteration 3036/ 6200 | consumed samples: 3108864 | consumed tokens: 6366953472 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753990E+00 | loss scale: 4096.0 | grad norm: 5.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.736 | TFLOPs: 42.29 | +[default7]: iteration 3037/ 6200 | consumed samples: 3109888 | consumed tokens: 6369050624 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.744023E+00 | loss scale: 4096.0 | grad norm: 8.011 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.227 | TFLOPs: 42.44 | +[default7]: iteration 3038/ 6200 | consumed samples: 3110912 | consumed tokens: 6371147776 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749241E+00 | loss scale: 4096.0 | grad norm: 6.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.914 | TFLOPs: 42.34 | +[default7]: iteration 3039/ 6200 | consumed samples: 3111936 | consumed tokens: 6373244928 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.762509E+00 | loss scale: 4096.0 | grad norm: 5.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.648 | TFLOPs: 42.26 | +[default7]: iteration 3040/ 6200 | consumed samples: 3112960 | consumed tokens: 6375342080 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743824E+00 | loss scale: 4096.0 | grad norm: 5.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.855 | TFLOPs: 42.32 | +[default7]: iteration 3041/ 6200 | consumed samples: 3113984 | consumed tokens: 6377439232 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761006E+00 | loss scale: 4096.0 | grad norm: 5.910 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.660 | TFLOPs: 42.26 | +[default7]: iteration 3042/ 6200 | consumed samples: 3115008 | consumed tokens: 6379536384 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.771692E+00 | loss scale: 4096.0 | grad norm: 5.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.018 | TFLOPs: 42.37 | +[default7]: iteration 3043/ 6200 | consumed samples: 3116032 | consumed tokens: 6381633536 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.769662E+00 | loss scale: 4096.0 | grad norm: 5.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.368 | TFLOPs: 42.48 | +[default7]: iteration 3044/ 6200 | consumed samples: 3117056 | consumed tokens: 6383730688 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.757543E+00 | loss scale: 4096.0 | grad norm: 5.835 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.141 | TFLOPs: 42.41 | +[default7]: iteration 3045/ 6200 | consumed samples: 3118080 | consumed tokens: 6385827840 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782515E+00 | loss scale: 4096.0 | grad norm: 5.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.843 | TFLOPs: 42.32 | +[default7]: iteration 3046/ 6200 | consumed samples: 3119104 | consumed tokens: 6387924992 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.758663E+00 | loss scale: 4096.0 | grad norm: 5.748 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.001 | TFLOPs: 42.37 | +[default7]: iteration 3047/ 6200 | consumed samples: 3120128 | consumed tokens: 6390022144 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761638E+00 | loss scale: 4096.0 | grad norm: 5.935 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.946 | TFLOPs: 42.35 | +[default7]: iteration 3048/ 6200 | consumed samples: 3121152 | consumed tokens: 6392119296 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.776037E+00 | loss scale: 4096.0 | grad norm: 4.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.307 | TFLOPs: 42.46 | +[default7]: iteration 3049/ 6200 | consumed samples: 3122176 | consumed tokens: 6394216448 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741141E+00 | loss scale: 4096.0 | grad norm: 5.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.073 | TFLOPs: 42.39 | +[default7]: iteration 3050/ 6200 | consumed samples: 3123200 | consumed tokens: 6396313600 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.752382E+00 | loss scale: 4096.0 | grad norm: 5.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.276 | TFLOPs: 42.45 | +[default7]: iteration 3051/ 6200 | consumed samples: 3124224 | consumed tokens: 6398410752 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747338E+00 | loss scale: 4096.0 | grad norm: 4.816 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.609 | TFLOPs: 42.25 | +[default7]: iteration 3052/ 6200 | consumed samples: 3125248 | consumed tokens: 6400507904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741974E+00 | loss scale: 4096.0 | grad norm: 5.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.705 | TFLOPs: 42.28 | +[default7]: iteration 3053/ 6200 | consumed samples: 3126272 | consumed tokens: 6402605056 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765321E+00 | loss scale: 4096.0 | grad norm: 6.011 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.928 | TFLOPs: 42.35 | +[default7]: iteration 3054/ 6200 | consumed samples: 3127296 | consumed tokens: 6404702208 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.764492E+00 | loss scale: 4096.0 | grad norm: 6.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.628 | TFLOPs: 42.26 | +[default7]: iteration 3055/ 6200 | consumed samples: 3128320 | consumed tokens: 6406799360 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775746E+00 | loss scale: 4096.0 | grad norm: 5.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.630 | TFLOPs: 42.26 | +[default7]: iteration 3056/ 6200 | consumed samples: 3129344 | consumed tokens: 6408896512 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742139E+00 | loss scale: 4096.0 | grad norm: 5.819 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.000 | TFLOPs: 42.37 | +[default7]: iteration 3057/ 6200 | consumed samples: 3130368 | consumed tokens: 6410993664 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.754304E+00 | loss scale: 4096.0 | grad norm: 4.949 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.911 | TFLOPs: 42.34 | +[default7]: iteration 3058/ 6200 | consumed samples: 3131392 | consumed tokens: 6413090816 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765431E+00 | loss scale: 4096.0 | grad norm: 5.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.767 | TFLOPs: 42.30 | +[default7]: iteration 3059/ 6200 | consumed samples: 3132416 | consumed tokens: 6415187968 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724658E+00 | loss scale: 4096.0 | grad norm: 5.764 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.448 | TFLOPs: 42.20 | +[default7]: iteration 3060/ 6200 | consumed samples: 3133440 | consumed tokens: 6417285120 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.766589E+00 | loss scale: 4096.0 | grad norm: 6.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.753 | TFLOPs: 42.29 | +[default7]: iteration 3061/ 6200 | consumed samples: 3134464 | consumed tokens: 6419382272 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778569E+00 | loss scale: 4096.0 | grad norm: 5.927 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.576 | TFLOPs: 42.24 | +[default7]: iteration 3062/ 6200 | consumed samples: 3135488 | consumed tokens: 6421479424 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.760177E+00 | loss scale: 4096.0 | grad norm: 5.672 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.545 | TFLOPs: 42.23 | +[default7]: iteration 3063/ 6200 | consumed samples: 3136512 | consumed tokens: 6423576576 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.767185E+00 | loss scale: 4096.0 | grad norm: 5.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.394 | TFLOPs: 42.18 | +[default7]: iteration 3064/ 6200 | consumed samples: 3137536 | consumed tokens: 6425673728 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.773295E+00 | loss scale: 4096.0 | grad norm: 6.698 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.685 | TFLOPs: 42.27 | +[default7]: iteration 3065/ 6200 | consumed samples: 3138560 | consumed tokens: 6427770880 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733322E+00 | loss scale: 4096.0 | grad norm: 5.932 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.639 | TFLOPs: 42.26 | +[default7]: iteration 3066/ 6200 | consumed samples: 3139584 | consumed tokens: 6429868032 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756510E+00 | loss scale: 4096.0 | grad norm: 5.859 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.426 | TFLOPs: 42.19 | +[default7]: iteration 3067/ 6200 | consumed samples: 3140608 | consumed tokens: 6431965184 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.762755E+00 | loss scale: 4096.0 | grad norm: 6.997 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.492 | TFLOPs: 42.21 | +[default7]: iteration 3068/ 6200 | consumed samples: 3141632 | consumed tokens: 6434062336 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.760773E+00 | loss scale: 4096.0 | grad norm: 6.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.451 | TFLOPs: 42.20 | +[default7]: iteration 3069/ 6200 | consumed samples: 3142656 | consumed tokens: 6436159488 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739442E+00 | loss scale: 4096.0 | grad norm: 6.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.253 | TFLOPs: 42.45 | +[default7]: iteration 3070/ 6200 | consumed samples: 3143680 | consumed tokens: 6438256640 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748116E+00 | loss scale: 4096.0 | grad norm: 6.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.155 | TFLOPs: 42.42 | +[default7]: iteration 3071/ 6200 | consumed samples: 3144704 | consumed tokens: 6440353792 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.751585E+00 | loss scale: 4096.0 | grad norm: 5.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.416 | TFLOPs: 42.19 | +[default7]: iteration 3072/ 6200 | consumed samples: 3145728 | consumed tokens: 6442450944 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.752698E+00 | loss scale: 4096.0 | grad norm: 5.912 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.027 | TFLOPs: 42.38 | +[default7]: iteration 3073/ 6200 | consumed samples: 3146752 | consumed tokens: 6444548096 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.744493E+00 | loss scale: 4096.0 | grad norm: 5.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.663 | TFLOPs: 42.27 | +[default7]: iteration 3074/ 6200 | consumed samples: 3147776 | consumed tokens: 6446645248 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761305E+00 | loss scale: 4096.0 | grad norm: 5.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.881 | TFLOPs: 42.33 | +[default7]: iteration 3075/ 6200 | consumed samples: 3148800 | consumed tokens: 6448742400 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.763843E+00 | loss scale: 4096.0 | grad norm: 5.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.262 | TFLOPs: 42.45 | +[default7]: iteration 3076/ 6200 | consumed samples: 3149824 | consumed tokens: 6450839552 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738797E+00 | loss scale: 4096.0 | grad norm: 4.835 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.159 | TFLOPs: 42.11 | +[default7]: iteration 3077/ 6200 | consumed samples: 3150848 | consumed tokens: 6452936704 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.790519E+00 | loss scale: 4096.0 | grad norm: 5.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.152 | TFLOPs: 42.11 | +[default7]: iteration 3078/ 6200 | consumed samples: 3151872 | consumed tokens: 6455033856 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.774703E+00 | loss scale: 4096.0 | grad norm: 6.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.109 | TFLOPs: 42.10 | +[default7]: iteration 3079/ 6200 | consumed samples: 3152896 | consumed tokens: 6457131008 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.760159E+00 | loss scale: 4096.0 | grad norm: 5.811 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.386 | TFLOPs: 42.18 | +[default7]: iteration 3080/ 6200 | consumed samples: 3153920 | consumed tokens: 6459228160 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753730E+00 | loss scale: 4096.0 | grad norm: 5.858 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.284 | TFLOPs: 42.45 | +[default7]: iteration 3081/ 6200 | consumed samples: 3154944 | consumed tokens: 6461325312 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.730547E+00 | loss scale: 4096.0 | grad norm: 6.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.353 | TFLOPs: 42.17 | +[default7]: iteration 3082/ 6200 | consumed samples: 3155968 | consumed tokens: 6463422464 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.737298E+00 | loss scale: 4096.0 | grad norm: 5.683 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.412 | TFLOPs: 42.19 | +[default7]: iteration 3083/ 6200 | consumed samples: 3156992 | consumed tokens: 6465519616 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727551E+00 | loss scale: 4096.0 | grad norm: 5.627 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.118 | TFLOPs: 42.40 | +[default7]: iteration 3084/ 6200 | consumed samples: 3158016 | consumed tokens: 6467616768 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.780540E+00 | loss scale: 4096.0 | grad norm: 5.044 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.956 | TFLOPs: 42.35 | +[default7]: iteration 3085/ 6200 | consumed samples: 3159040 | consumed tokens: 6469713920 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750253E+00 | loss scale: 4096.0 | grad norm: 5.898 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.279 | TFLOPs: 42.45 | +[default7]: iteration 3086/ 6200 | consumed samples: 3160064 | consumed tokens: 6471811072 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761824E+00 | loss scale: 4096.0 | grad norm: 5.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.037 | TFLOPs: 42.38 | +[default7]: iteration 3087/ 6200 | consumed samples: 3161088 | consumed tokens: 6473908224 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740844E+00 | loss scale: 4096.0 | grad norm: 5.577 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.739 | TFLOPs: 42.29 | +[default7]: iteration 3088/ 6200 | consumed samples: 3162112 | consumed tokens: 6476005376 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741758E+00 | loss scale: 4096.0 | grad norm: 5.585 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.637 | TFLOPs: 42.26 | +[default7]: iteration 3089/ 6200 | consumed samples: 3163136 | consumed tokens: 6478102528 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.757057E+00 | loss scale: 4096.0 | grad norm: 5.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.792 | TFLOPs: 42.31 | +[default7]: iteration 3090/ 6200 | consumed samples: 3164160 | consumed tokens: 6480199680 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.757822E+00 | loss scale: 4096.0 | grad norm: 6.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.154 | TFLOPs: 42.42 | +[default7]: iteration 3091/ 6200 | consumed samples: 3165184 | consumed tokens: 6482296832 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.755239E+00 | loss scale: 4096.0 | grad norm: 6.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.977 | TFLOPs: 42.36 | +[default7]: iteration 3092/ 6200 | consumed samples: 3166208 | consumed tokens: 6484393984 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772586E+00 | loss scale: 4096.0 | grad norm: 6.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.154 | TFLOPs: 42.42 | +[default7]: iteration 3093/ 6200 | consumed samples: 3167232 | consumed tokens: 6486491136 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714471E+00 | loss scale: 4096.0 | grad norm: 5.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.718 | TFLOPs: 42.28 | +[default7]: iteration 3094/ 6200 | consumed samples: 3168256 | consumed tokens: 6488588288 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.779914E+00 | loss scale: 4096.0 | grad norm: 5.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.124 | TFLOPs: 42.41 | +[default7]: iteration 3095/ 6200 | consumed samples: 3169280 | consumed tokens: 6490685440 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713537E+00 | loss scale: 4096.0 | grad norm: 5.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.118 | TFLOPs: 42.40 | +[default7]: iteration 3096/ 6200 | consumed samples: 3170304 | consumed tokens: 6492782592 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.759638E+00 | loss scale: 4096.0 | grad norm: 6.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.501 | TFLOPs: 42.22 | +[default7]: iteration 3097/ 6200 | consumed samples: 3171328 | consumed tokens: 6494879744 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.759723E+00 | loss scale: 4096.0 | grad norm: 5.881 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.826 | TFLOPs: 42.32 | +[default7]: iteration 3098/ 6200 | consumed samples: 3172352 | consumed tokens: 6496976896 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761365E+00 | loss scale: 4096.0 | grad norm: 5.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.853 | TFLOPs: 42.32 | +[default7]: iteration 3099/ 6200 | consumed samples: 3173376 | consumed tokens: 6499074048 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753631E+00 | loss scale: 4096.0 | grad norm: 6.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.619 | TFLOPs: 42.25 | +[default7]: iteration 3100/ 6200 | consumed samples: 3174400 | consumed tokens: 6501171200 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.757949E+00 | loss scale: 4096.0 | grad norm: 6.021 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.658 | TFLOPs: 42.26 | +[default7]: iteration 3101/ 6200 | consumed samples: 3175424 | consumed tokens: 6503268352 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.730196E+00 | loss scale: 4096.0 | grad norm: 4.962 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.124 | TFLOPs: 42.41 | +[default7]: iteration 3102/ 6200 | consumed samples: 3176448 | consumed tokens: 6505365504 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747419E+00 | loss scale: 4096.0 | grad norm: 5.709 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.688 | TFLOPs: 42.27 | +[default7]: iteration 3103/ 6200 | consumed samples: 3177472 | consumed tokens: 6507462656 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749302E+00 | loss scale: 4096.0 | grad norm: 5.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.093 | TFLOPs: 42.40 | +[default7]: iteration 3104/ 6200 | consumed samples: 3178496 | consumed tokens: 6509559808 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.769176E+00 | loss scale: 4096.0 | grad norm: 5.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.159 | TFLOPs: 42.42 | +[default7]: iteration 3105/ 6200 | consumed samples: 3179520 | consumed tokens: 6511656960 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.752756E+00 | loss scale: 4096.0 | grad norm: 5.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.853 | TFLOPs: 42.32 | +[default7]: iteration 3106/ 6200 | consumed samples: 3180544 | consumed tokens: 6513754112 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.773159E+00 | loss scale: 4096.0 | grad norm: 5.062 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.674 | TFLOPs: 42.27 | +[default7]: iteration 3107/ 6200 | consumed samples: 3181568 | consumed tokens: 6515851264 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731646E+00 | loss scale: 4096.0 | grad norm: 5.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.885 | TFLOPs: 42.33 | +[default7]: iteration 3108/ 6200 | consumed samples: 3182592 | consumed tokens: 6517948416 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742424E+00 | loss scale: 4096.0 | grad norm: 5.997 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.880 | TFLOPs: 42.33 | +[default7]: iteration 3109/ 6200 | consumed samples: 3183616 | consumed tokens: 6520045568 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.789809E+00 | loss scale: 4096.0 | grad norm: 4.859 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.853 | TFLOPs: 42.32 | +[default7]: iteration 3110/ 6200 | consumed samples: 3184640 | consumed tokens: 6522142720 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.773660E+00 | loss scale: 4096.0 | grad norm: 5.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.726 | TFLOPs: 42.28 | +[default7]: iteration 3111/ 6200 | consumed samples: 3185664 | consumed tokens: 6524239872 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.752846E+00 | loss scale: 4096.0 | grad norm: 5.060 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.474 | TFLOPs: 42.21 | +[default7]: iteration 3112/ 6200 | consumed samples: 3186688 | consumed tokens: 6526337024 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743273E+00 | loss scale: 4096.0 | grad norm: 7.714 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.681 | TFLOPs: 42.27 | +[default7]: iteration 3113/ 6200 | consumed samples: 3187712 | consumed tokens: 6528434176 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761969E+00 | loss scale: 4096.0 | grad norm: 6.989 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.771 | TFLOPs: 42.30 | +[default7]: iteration 3114/ 6200 | consumed samples: 3188736 | consumed tokens: 6530531328 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742143E+00 | loss scale: 4096.0 | grad norm: 6.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.432 | TFLOPs: 42.20 | +[default7]: iteration 3115/ 6200 | consumed samples: 3189760 | consumed tokens: 6532628480 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775930E+00 | loss scale: 4096.0 | grad norm: 5.896 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.793 | TFLOPs: 42.31 | +[default7]: iteration 3116/ 6200 | consumed samples: 3190784 | consumed tokens: 6534725632 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.745153E+00 | loss scale: 4096.0 | grad norm: 6.038 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.914 | TFLOPs: 42.34 | +[default7]: iteration 3117/ 6200 | consumed samples: 3191808 | consumed tokens: 6536822784 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743486E+00 | loss scale: 4096.0 | grad norm: 5.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.996 | TFLOPs: 42.37 | +[default7]: iteration 3118/ 6200 | consumed samples: 3192832 | consumed tokens: 6538919936 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756559E+00 | loss scale: 4096.0 | grad norm: 5.894 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.925 | TFLOPs: 42.35 | +[default7]: iteration 3119/ 6200 | consumed samples: 3193856 | consumed tokens: 6541017088 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.762225E+00 | loss scale: 4096.0 | grad norm: 6.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.652 | TFLOPs: 42.26 | +[default7]: iteration 3120/ 6200 | consumed samples: 3194880 | consumed tokens: 6543114240 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.773580E+00 | loss scale: 4096.0 | grad norm: 6.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.743 | TFLOPs: 42.29 | +[default7]: iteration 3121/ 6200 | consumed samples: 3195904 | consumed tokens: 6545211392 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.799402E+00 | loss scale: 4096.0 | grad norm: 4.981 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.989 | TFLOPs: 42.37 | +[default7]: iteration 3122/ 6200 | consumed samples: 3196928 | consumed tokens: 6547308544 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.763016E+00 | loss scale: 4096.0 | grad norm: 5.693 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.813 | TFLOPs: 42.31 | +[default7]: iteration 3123/ 6200 | consumed samples: 3197952 | consumed tokens: 6549405696 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738848E+00 | loss scale: 4096.0 | grad norm: 5.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.611 | TFLOPs: 42.25 | +[default7]: iteration 3124/ 6200 | consumed samples: 3198976 | consumed tokens: 6551502848 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747843E+00 | loss scale: 4096.0 | grad norm: 5.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.940 | TFLOPs: 42.35 | +[default7]: iteration 3125/ 6200 | consumed samples: 3200000 | consumed tokens: 6553600000 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749936E+00 | loss scale: 4096.0 | grad norm: 4.867 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.841 | TFLOPs: 42.32 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 3125 | lm loss value: 3.542447E+00 | lm loss PPL: 3.455136E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 3125 | lm loss value: 1.639829E+00 | lm loss PPL: 5.154288E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 3126/ 6200 | consumed samples: 3201024 | consumed tokens: 6555697152 | elapsed time per iteration (s): 51.83 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.755471E+00 | loss scale: 4096.0 | grad norm: 5.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.757 | TFLOPs: 6.02 | +[default7]: iteration 3127/ 6200 | consumed samples: 3202048 | consumed tokens: 6557794304 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.752007E+00 | loss scale: 4096.0 | grad norm: 5.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.657 | TFLOPs: 42.26 | +[default7]: iteration 3128/ 6200 | consumed samples: 3203072 | consumed tokens: 6559891456 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.763445E+00 | loss scale: 4096.0 | grad norm: 5.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.451 | TFLOPs: 42.20 | +[default7]: iteration 3129/ 6200 | consumed samples: 3204096 | consumed tokens: 6561988608 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.752084E+00 | loss scale: 4096.0 | grad norm: 6.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.990 | TFLOPs: 42.37 | +[default7]: iteration 3130/ 6200 | consumed samples: 3205120 | consumed tokens: 6564085760 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739434E+00 | loss scale: 4096.0 | grad norm: 5.821 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.856 | TFLOPs: 42.32 | +[default7]: iteration 3131/ 6200 | consumed samples: 3206144 | consumed tokens: 6566182912 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761137E+00 | loss scale: 4096.0 | grad norm: 6.019 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.510 | TFLOPs: 42.22 | +[default7]: iteration 3132/ 6200 | consumed samples: 3207168 | consumed tokens: 6568280064 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743811E+00 | loss scale: 4096.0 | grad norm: 7.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.189 | TFLOPs: 42.12 | +[default7]: iteration 3133/ 6200 | consumed samples: 3208192 | consumed tokens: 6570377216 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753981E+00 | loss scale: 4096.0 | grad norm: 5.593 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.669 | TFLOPs: 42.27 | +[default7]: iteration 3134/ 6200 | consumed samples: 3209216 | consumed tokens: 6572474368 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739339E+00 | loss scale: 4096.0 | grad norm: 4.843 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.542 | TFLOPs: 42.23 | +[default7]: iteration 3135/ 6200 | consumed samples: 3210240 | consumed tokens: 6574571520 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.764230E+00 | loss scale: 4096.0 | grad norm: 6.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.960 | TFLOPs: 42.36 | +[default7]: iteration 3136/ 6200 | consumed samples: 3211264 | consumed tokens: 6576668672 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.781604E+00 | loss scale: 4096.0 | grad norm: 6.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.590 | TFLOPs: 42.24 | +[default7]: iteration 3137/ 6200 | consumed samples: 3212288 | consumed tokens: 6578765824 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.745854E+00 | loss scale: 4096.0 | grad norm: 5.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.957 | TFLOPs: 42.36 | +[default7]: iteration 3138/ 6200 | consumed samples: 3213312 | consumed tokens: 6580862976 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.766398E+00 | loss scale: 4096.0 | grad norm: 6.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.142 | TFLOPs: 42.41 | +[default7]: iteration 3139/ 6200 | consumed samples: 3214336 | consumed tokens: 6582960128 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.736220E+00 | loss scale: 4096.0 | grad norm: 5.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.713 | TFLOPs: 42.28 | +[default7]: iteration 3140/ 6200 | consumed samples: 3215360 | consumed tokens: 6585057280 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.758430E+00 | loss scale: 4096.0 | grad norm: 6.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.963 | TFLOPs: 42.36 | +[default7]: iteration 3141/ 6200 | consumed samples: 3216384 | consumed tokens: 6587154432 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756204E+00 | loss scale: 4096.0 | grad norm: 6.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.783 | TFLOPs: 42.30 | +[default7]: iteration 3142/ 6200 | consumed samples: 3217408 | consumed tokens: 6589251584 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749127E+00 | loss scale: 4096.0 | grad norm: 5.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.921 | TFLOPs: 42.34 | +[default7]: iteration 3143/ 6200 | consumed samples: 3218432 | consumed tokens: 6591348736 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770067E+00 | loss scale: 4096.0 | grad norm: 6.028 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.829 | TFLOPs: 42.32 | +[default7]: iteration 3144/ 6200 | consumed samples: 3219456 | consumed tokens: 6593445888 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.730720E+00 | loss scale: 4096.0 | grad norm: 5.917 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.810 | TFLOPs: 42.31 | +[default7]: iteration 3145/ 6200 | consumed samples: 3220480 | consumed tokens: 6595543040 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.736526E+00 | loss scale: 4096.0 | grad norm: 6.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.827 | TFLOPs: 42.32 | +[default7]: iteration 3146/ 6200 | consumed samples: 3221504 | consumed tokens: 6597640192 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.745088E+00 | loss scale: 4096.0 | grad norm: 6.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.803 | TFLOPs: 42.31 | +[default7]: iteration 3147/ 6200 | consumed samples: 3222528 | consumed tokens: 6599737344 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.767475E+00 | loss scale: 4096.0 | grad norm: 6.788 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.779 | TFLOPs: 42.30 | +[default7]: iteration 3148/ 6200 | consumed samples: 3223552 | consumed tokens: 6601834496 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753113E+00 | loss scale: 4096.0 | grad norm: 7.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.773 | TFLOPs: 42.30 | +[default7]: iteration 3149/ 6200 | consumed samples: 3224576 | consumed tokens: 6603931648 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733334E+00 | loss scale: 4096.0 | grad norm: 7.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.599 | TFLOPs: 42.25 | +[default7]: iteration 3150/ 6200 | consumed samples: 3225600 | consumed tokens: 6606028800 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740087E+00 | loss scale: 4096.0 | grad norm: 5.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.560 | TFLOPs: 42.23 | +[default7]: iteration 3151/ 6200 | consumed samples: 3226624 | consumed tokens: 6608125952 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.736642E+00 | loss scale: 4096.0 | grad norm: 6.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.416 | TFLOPs: 42.19 | +[default7]: iteration 3152/ 6200 | consumed samples: 3227648 | consumed tokens: 6610223104 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.764393E+00 | loss scale: 4096.0 | grad norm: 7.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.466 | TFLOPs: 42.21 | +[default7]: iteration 3153/ 6200 | consumed samples: 3228672 | consumed tokens: 6612320256 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749233E+00 | loss scale: 4096.0 | grad norm: 4.798 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.556 | TFLOPs: 42.23 | +[default7]: iteration 3154/ 6200 | consumed samples: 3229696 | consumed tokens: 6614417408 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743516E+00 | loss scale: 4096.0 | grad norm: 6.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.737 | TFLOPs: 42.29 | +[default7]: iteration 3155/ 6200 | consumed samples: 3230720 | consumed tokens: 6616514560 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731235E+00 | loss scale: 4096.0 | grad norm: 5.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.404 | TFLOPs: 42.19 | +[default7]: iteration 3156/ 6200 | consumed samples: 3231744 | consumed tokens: 6618611712 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733499E+00 | loss scale: 4096.0 | grad norm: 5.983 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.914 | TFLOPs: 42.34 | +[default7]: iteration 3157/ 6200 | consumed samples: 3232768 | consumed tokens: 6620708864 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778434E+00 | loss scale: 4096.0 | grad norm: 5.948 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.619 | TFLOPs: 42.25 | +[default7]: iteration 3158/ 6200 | consumed samples: 3233792 | consumed tokens: 6622806016 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.780291E+00 | loss scale: 4096.0 | grad norm: 5.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.796 | TFLOPs: 42.31 | +[default7]: iteration 3159/ 6200 | consumed samples: 3234816 | consumed tokens: 6624903168 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722802E+00 | loss scale: 4096.0 | grad norm: 5.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.846 | TFLOPs: 42.32 | +[default7]: iteration 3160/ 6200 | consumed samples: 3235840 | consumed tokens: 6627000320 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726714E+00 | loss scale: 4096.0 | grad norm: 5.709 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.800 | TFLOPs: 42.31 | +[default7]: iteration 3161/ 6200 | consumed samples: 3236864 | consumed tokens: 6629097472 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753670E+00 | loss scale: 4096.0 | grad norm: 5.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.982 | TFLOPs: 42.36 | +[default7]: iteration 3162/ 6200 | consumed samples: 3237888 | consumed tokens: 6631194624 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735924E+00 | loss scale: 4096.0 | grad norm: 7.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.490 | TFLOPs: 42.21 | +[default7]: iteration 3163/ 6200 | consumed samples: 3238912 | consumed tokens: 6633291776 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739576E+00 | loss scale: 4096.0 | grad norm: 5.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.954 | TFLOPs: 42.35 | +[default7]: iteration 3164/ 6200 | consumed samples: 3239936 | consumed tokens: 6635388928 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746262E+00 | loss scale: 4096.0 | grad norm: 6.003 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.932 | TFLOPs: 42.35 | +[default7]: iteration 3165/ 6200 | consumed samples: 3240960 | consumed tokens: 6637486080 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.759062E+00 | loss scale: 4096.0 | grad norm: 6.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.046 | TFLOPs: 42.08 | +[default7]: iteration 3166/ 6200 | consumed samples: 3241984 | consumed tokens: 6639583232 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.755790E+00 | loss scale: 4096.0 | grad norm: 6.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.320 | TFLOPs: 42.16 | +[default7]: iteration 3167/ 6200 | consumed samples: 3243008 | consumed tokens: 6641680384 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.758672E+00 | loss scale: 4096.0 | grad norm: 5.734 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.479 | TFLOPs: 42.21 | +[default7]: iteration 3168/ 6200 | consumed samples: 3244032 | consumed tokens: 6643777536 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727673E+00 | loss scale: 4096.0 | grad norm: 6.692 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.722 | TFLOPs: 42.28 | +[default7]: iteration 3169/ 6200 | consumed samples: 3245056 | consumed tokens: 6645874688 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743032E+00 | loss scale: 4096.0 | grad norm: 6.740 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.424 | TFLOPs: 42.19 | +[default7]: iteration 3170/ 6200 | consumed samples: 3246080 | consumed tokens: 6647971840 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739254E+00 | loss scale: 4096.0 | grad norm: 6.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.157 | TFLOPs: 42.11 | +[default7]: iteration 3171/ 6200 | consumed samples: 3247104 | consumed tokens: 6650068992 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756214E+00 | loss scale: 4096.0 | grad norm: 5.629 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.457 | TFLOPs: 42.20 | +[default7]: iteration 3172/ 6200 | consumed samples: 3248128 | consumed tokens: 6652166144 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739108E+00 | loss scale: 4096.0 | grad norm: 5.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.189 | TFLOPs: 42.12 | +[default7]: iteration 3173/ 6200 | consumed samples: 3249152 | consumed tokens: 6654263296 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.769708E+00 | loss scale: 4096.0 | grad norm: 6.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.318 | TFLOPs: 42.16 | +[default7]: iteration 3174/ 6200 | consumed samples: 3250176 | consumed tokens: 6656360448 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726630E+00 | loss scale: 4096.0 | grad norm: 5.897 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.068 | TFLOPs: 42.08 | +[default7]: iteration 3175/ 6200 | consumed samples: 3251200 | consumed tokens: 6658457600 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749887E+00 | loss scale: 4096.0 | grad norm: 6.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.350 | TFLOPs: 42.17 | +[default7]: iteration 3176/ 6200 | consumed samples: 3252224 | consumed tokens: 6660554752 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719981E+00 | loss scale: 4096.0 | grad norm: 5.680 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.911 | TFLOPs: 42.04 | +[default7]: iteration 3177/ 6200 | consumed samples: 3253248 | consumed tokens: 6662651904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770007E+00 | loss scale: 4096.0 | grad norm: 5.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.719 | TFLOPs: 42.28 | +[default7]: iteration 3178/ 6200 | consumed samples: 3254272 | consumed tokens: 6664749056 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.775887E+00 | loss scale: 4096.0 | grad norm: 4.929 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.119 | TFLOPs: 42.40 | +[default7]: iteration 3179/ 6200 | consumed samples: 3255296 | consumed tokens: 6666846208 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749934E+00 | loss scale: 4096.0 | grad norm: 5.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.715 | TFLOPs: 42.28 | +[default7]: iteration 3180/ 6200 | consumed samples: 3256320 | consumed tokens: 6668943360 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735756E+00 | loss scale: 4096.0 | grad norm: 5.756 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.651 | TFLOPs: 42.26 | +[default7]: iteration 3181/ 6200 | consumed samples: 3257344 | consumed tokens: 6671040512 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.758630E+00 | loss scale: 4096.0 | grad norm: 6.086 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.119 | TFLOPs: 42.40 | +[default7]: iteration 3182/ 6200 | consumed samples: 3258368 | consumed tokens: 6673137664 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748303E+00 | loss scale: 4096.0 | grad norm: 5.954 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.084 | TFLOPs: 42.39 | +[default7]: iteration 3183/ 6200 | consumed samples: 3259392 | consumed tokens: 6675234816 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.751500E+00 | loss scale: 4096.0 | grad norm: 5.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.272 | TFLOPs: 42.45 | +[default7]: iteration 3184/ 6200 | consumed samples: 3260416 | consumed tokens: 6677331968 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722951E+00 | loss scale: 4096.0 | grad norm: 5.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.281 | TFLOPs: 42.45 | +[default7]: iteration 3185/ 6200 | consumed samples: 3261440 | consumed tokens: 6679429120 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.759703E+00 | loss scale: 4096.0 | grad norm: 5.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.150 | TFLOPs: 42.41 | +[default7]: iteration 3186/ 6200 | consumed samples: 3262464 | consumed tokens: 6681526272 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747904E+00 | loss scale: 4096.0 | grad norm: 5.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.885 | TFLOPs: 42.33 | +[default7]: iteration 3187/ 6200 | consumed samples: 3263488 | consumed tokens: 6683623424 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.728134E+00 | loss scale: 4096.0 | grad norm: 5.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.940 | TFLOPs: 42.35 | +[default7]: iteration 3188/ 6200 | consumed samples: 3264512 | consumed tokens: 6685720576 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765151E+00 | loss scale: 4096.0 | grad norm: 5.858 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.274 | TFLOPs: 42.45 | +[default7]: iteration 3189/ 6200 | consumed samples: 3265536 | consumed tokens: 6687817728 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741933E+00 | loss scale: 4096.0 | grad norm: 5.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.260 | TFLOPs: 42.45 | +[default7]: iteration 3190/ 6200 | consumed samples: 3266560 | consumed tokens: 6689914880 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.766072E+00 | loss scale: 4096.0 | grad norm: 5.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.159 | TFLOPs: 42.42 | +[default7]: iteration 3191/ 6200 | consumed samples: 3267584 | consumed tokens: 6692012032 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742595E+00 | loss scale: 4096.0 | grad norm: 5.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.269 | TFLOPs: 42.45 | +[default7]: iteration 3192/ 6200 | consumed samples: 3268608 | consumed tokens: 6694109184 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.758666E+00 | loss scale: 4096.0 | grad norm: 6.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.464 | TFLOPs: 42.51 | +[default7]: iteration 3193/ 6200 | consumed samples: 3269632 | consumed tokens: 6696206336 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.764779E+00 | loss scale: 4096.0 | grad norm: 5.641 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.193 | TFLOPs: 42.43 | +[default7]: iteration 3194/ 6200 | consumed samples: 3270656 | consumed tokens: 6698303488 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.737996E+00 | loss scale: 4096.0 | grad norm: 7.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.064 | TFLOPs: 42.39 | +[default7]: iteration 3195/ 6200 | consumed samples: 3271680 | consumed tokens: 6700400640 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.754102E+00 | loss scale: 4096.0 | grad norm: 6.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.119 | TFLOPs: 42.40 | +[default7]: iteration 3196/ 6200 | consumed samples: 3272704 | consumed tokens: 6702497792 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743889E+00 | loss scale: 4096.0 | grad norm: 5.687 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.090 | TFLOPs: 42.40 | +[default7]: iteration 3197/ 6200 | consumed samples: 3273728 | consumed tokens: 6704594944 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.752561E+00 | loss scale: 4096.0 | grad norm: 5.796 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.114 | TFLOPs: 42.40 | +[default7]: iteration 3198/ 6200 | consumed samples: 3274752 | consumed tokens: 6706692096 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.769759E+00 | loss scale: 4096.0 | grad norm: 5.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.029 | TFLOPs: 42.38 | +[default7]: iteration 3199/ 6200 | consumed samples: 3275776 | consumed tokens: 6708789248 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747534E+00 | loss scale: 4096.0 | grad norm: 6.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.122 | TFLOPs: 42.41 | +[default7]: iteration 3200/ 6200 | consumed samples: 3276800 | consumed tokens: 6710886400 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715992E+00 | loss scale: 4096.0 | grad norm: 5.594 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.178 | TFLOPs: 42.42 | +[default7]: iteration 3201/ 6200 | consumed samples: 3277824 | consumed tokens: 6712983552 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742431E+00 | loss scale: 4096.0 | grad norm: 5.873 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.300 | TFLOPs: 42.46 | +[default7]: iteration 3202/ 6200 | consumed samples: 3278848 | consumed tokens: 6715080704 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750066E+00 | loss scale: 4096.0 | grad norm: 6.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.287 | TFLOPs: 42.46 | +[default7]: iteration 3203/ 6200 | consumed samples: 3279872 | consumed tokens: 6717177856 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756067E+00 | loss scale: 4096.0 | grad norm: 5.634 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.787 | TFLOPs: 42.30 | +[default7]: iteration 3204/ 6200 | consumed samples: 3280896 | consumed tokens: 6719275008 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748878E+00 | loss scale: 4096.0 | grad norm: 5.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.671 | TFLOPs: 42.27 | +[default7]: iteration 3205/ 6200 | consumed samples: 3281920 | consumed tokens: 6721372160 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.769549E+00 | loss scale: 4096.0 | grad norm: 5.819 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.814 | TFLOPs: 42.31 | +[default7]: iteration 3206/ 6200 | consumed samples: 3282944 | consumed tokens: 6723469312 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748194E+00 | loss scale: 4096.0 | grad norm: 5.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.053 | TFLOPs: 42.38 | +[default7]: iteration 3207/ 6200 | consumed samples: 3283968 | consumed tokens: 6725566464 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747700E+00 | loss scale: 4096.0 | grad norm: 6.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.309 | TFLOPs: 42.46 | +[default7]: iteration 3208/ 6200 | consumed samples: 3284992 | consumed tokens: 6727663616 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741316E+00 | loss scale: 4096.0 | grad norm: 6.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.169 | TFLOPs: 42.42 | +[default7]: iteration 3209/ 6200 | consumed samples: 3286016 | consumed tokens: 6729760768 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.754299E+00 | loss scale: 4096.0 | grad norm: 5.018 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.123 | TFLOPs: 42.41 | +[default7]: iteration 3210/ 6200 | consumed samples: 3287040 | consumed tokens: 6731857920 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.754164E+00 | loss scale: 4096.0 | grad norm: 5.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.033 | TFLOPs: 42.38 | +[default7]: iteration 3211/ 6200 | consumed samples: 3288064 | consumed tokens: 6733955072 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753191E+00 | loss scale: 4096.0 | grad norm: 4.825 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.082 | TFLOPs: 42.39 | +[default7]: iteration 3212/ 6200 | consumed samples: 3289088 | consumed tokens: 6736052224 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738952E+00 | loss scale: 4096.0 | grad norm: 5.771 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.068 | TFLOPs: 42.39 | +[default7]: iteration 3213/ 6200 | consumed samples: 3290112 | consumed tokens: 6738149376 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.759719E+00 | loss scale: 4096.0 | grad norm: 6.748 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.121 | TFLOPs: 42.41 | +[default7]: iteration 3214/ 6200 | consumed samples: 3291136 | consumed tokens: 6740246528 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746672E+00 | loss scale: 4096.0 | grad norm: 7.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.874 | TFLOPs: 42.33 | +[default7]: iteration 3215/ 6200 | consumed samples: 3292160 | consumed tokens: 6742343680 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753286E+00 | loss scale: 4096.0 | grad norm: 6.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.943 | TFLOPs: 42.35 | +[default7]: iteration 3216/ 6200 | consumed samples: 3293184 | consumed tokens: 6744440832 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719827E+00 | loss scale: 4096.0 | grad norm: 5.780 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.124 | TFLOPs: 42.41 | +[default7]: iteration 3217/ 6200 | consumed samples: 3294208 | consumed tokens: 6746537984 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743560E+00 | loss scale: 4096.0 | grad norm: 6.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.074 | TFLOPs: 42.39 | +[default7]: iteration 3218/ 6200 | consumed samples: 3295232 | consumed tokens: 6748635136 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747626E+00 | loss scale: 4096.0 | grad norm: 6.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.026 | TFLOPs: 42.38 | +[default7]: iteration 3219/ 6200 | consumed samples: 3296256 | consumed tokens: 6750732288 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.752185E+00 | loss scale: 4096.0 | grad norm: 5.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.337 | TFLOPs: 42.47 | +[default7]: iteration 3220/ 6200 | consumed samples: 3297280 | consumed tokens: 6752829440 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719513E+00 | loss scale: 4096.0 | grad norm: 5.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.441 | TFLOPs: 42.50 | +[default7]: iteration 3221/ 6200 | consumed samples: 3298304 | consumed tokens: 6754926592 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724075E+00 | loss scale: 4096.0 | grad norm: 4.909 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.141 | TFLOPs: 42.41 | +[default7]: iteration 3222/ 6200 | consumed samples: 3299328 | consumed tokens: 6757023744 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733051E+00 | loss scale: 4096.0 | grad norm: 5.600 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.976 | TFLOPs: 42.36 | +[default7]: iteration 3223/ 6200 | consumed samples: 3300352 | consumed tokens: 6759120896 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.736530E+00 | loss scale: 4096.0 | grad norm: 5.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.007 | TFLOPs: 42.37 | +[default7]: iteration 3224/ 6200 | consumed samples: 3301376 | consumed tokens: 6761218048 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.751600E+00 | loss scale: 4096.0 | grad norm: 5.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.622 | TFLOPs: 42.25 | +[default7]: iteration 3225/ 6200 | consumed samples: 3302400 | consumed tokens: 6763315200 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753826E+00 | loss scale: 4096.0 | grad norm: 5.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.100 | TFLOPs: 42.40 | +[default7]: iteration 3226/ 6200 | consumed samples: 3303424 | consumed tokens: 6765412352 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738127E+00 | loss scale: 4096.0 | grad norm: 5.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.124 | TFLOPs: 42.41 | +[default7]: iteration 3227/ 6200 | consumed samples: 3304448 | consumed tokens: 6767509504 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738389E+00 | loss scale: 4096.0 | grad norm: 6.791 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.997 | TFLOPs: 42.37 | +[default7]: iteration 3228/ 6200 | consumed samples: 3305472 | consumed tokens: 6769606656 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726655E+00 | loss scale: 4096.0 | grad norm: 6.831 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.057 | TFLOPs: 42.39 | +[default7]: iteration 3229/ 6200 | consumed samples: 3306496 | consumed tokens: 6771703808 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731520E+00 | loss scale: 4096.0 | grad norm: 6.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.127 | TFLOPs: 42.41 | +[default7]: iteration 3230/ 6200 | consumed samples: 3307520 | consumed tokens: 6773800960 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750080E+00 | loss scale: 4096.0 | grad norm: 5.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.676 | TFLOPs: 42.27 | +[default7]: iteration 3231/ 6200 | consumed samples: 3308544 | consumed tokens: 6775898112 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727517E+00 | loss scale: 4096.0 | grad norm: 6.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.328 | TFLOPs: 42.16 | +[default7]: iteration 3232/ 6200 | consumed samples: 3309568 | consumed tokens: 6777995264 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.759739E+00 | loss scale: 4096.0 | grad norm: 7.022 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.323 | TFLOPs: 42.16 | +[default7]: iteration 3233/ 6200 | consumed samples: 3310592 | consumed tokens: 6780092416 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.737575E+00 | loss scale: 4096.0 | grad norm: 5.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.624 | TFLOPs: 42.25 | +[default7]: iteration 3234/ 6200 | consumed samples: 3311616 | consumed tokens: 6782189568 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.744978E+00 | loss scale: 4096.0 | grad norm: 5.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.980 | TFLOPs: 42.36 | +[default7]: iteration 3235/ 6200 | consumed samples: 3312640 | consumed tokens: 6784286720 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.755953E+00 | loss scale: 4096.0 | grad norm: 5.615 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.573 | TFLOPs: 42.24 | +[default7]: iteration 3236/ 6200 | consumed samples: 3313664 | consumed tokens: 6786383872 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719340E+00 | loss scale: 4096.0 | grad norm: 6.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.252 | TFLOPs: 42.14 | +[default7]: iteration 3237/ 6200 | consumed samples: 3314688 | consumed tokens: 6788481024 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747425E+00 | loss scale: 4096.0 | grad norm: 6.881 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.456 | TFLOPs: 42.20 | +[default7]: iteration 3238/ 6200 | consumed samples: 3315712 | consumed tokens: 6790578176 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.780147E+00 | loss scale: 4096.0 | grad norm: 6.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.255 | TFLOPs: 42.14 | +[default7]: iteration 3239/ 6200 | consumed samples: 3316736 | consumed tokens: 6792675328 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722204E+00 | loss scale: 4096.0 | grad norm: 7.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.706 | TFLOPs: 42.28 | +[default7]: iteration 3240/ 6200 | consumed samples: 3317760 | consumed tokens: 6794772480 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.744914E+00 | loss scale: 4096.0 | grad norm: 7.583 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.146 | TFLOPs: 42.11 | +[default7]: iteration 3241/ 6200 | consumed samples: 3318784 | consumed tokens: 6796869632 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.759520E+00 | loss scale: 4096.0 | grad norm: 5.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.429 | TFLOPs: 42.19 | +[default7]: iteration 3242/ 6200 | consumed samples: 3319808 | consumed tokens: 6798966784 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756562E+00 | loss scale: 4096.0 | grad norm: 5.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.487 | TFLOPs: 42.21 | +[default7]: iteration 3243/ 6200 | consumed samples: 3320832 | consumed tokens: 6801063936 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.728541E+00 | loss scale: 4096.0 | grad norm: 7.971 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.698 | TFLOPs: 42.28 | +[default7]: iteration 3244/ 6200 | consumed samples: 3321856 | consumed tokens: 6803161088 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.778368E+00 | loss scale: 4096.0 | grad norm: 6.783 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.741 | TFLOPs: 42.29 | +[default7]: iteration 3245/ 6200 | consumed samples: 3322880 | consumed tokens: 6805258240 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706039E+00 | loss scale: 4096.0 | grad norm: 5.756 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.875 | TFLOPs: 42.33 | +[default7]: iteration 3246/ 6200 | consumed samples: 3323904 | consumed tokens: 6807355392 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.758161E+00 | loss scale: 4096.0 | grad norm: 6.002 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.077 | TFLOPs: 42.39 | +[default7]: iteration 3247/ 6200 | consumed samples: 3324928 | consumed tokens: 6809452544 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761232E+00 | loss scale: 4096.0 | grad norm: 5.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.753 | TFLOPs: 42.29 | +[default7]: iteration 3248/ 6200 | consumed samples: 3325952 | consumed tokens: 6811549696 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.769349E+00 | loss scale: 4096.0 | grad norm: 7.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.746 | TFLOPs: 42.29 | +[default7]: iteration 3249/ 6200 | consumed samples: 3326976 | consumed tokens: 6813646848 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748888E+00 | loss scale: 4096.0 | grad norm: 6.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.994 | TFLOPs: 42.37 | +[default7]: iteration 3250/ 6200 | consumed samples: 3328000 | consumed tokens: 6815744000 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731703E+00 | loss scale: 4096.0 | grad norm: 7.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.627 | TFLOPs: 42.25 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 3250 | lm loss value: 3.548656E+00 | lm loss PPL: 3.476658E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 3250 | lm loss value: 1.633399E+00 | lm loss PPL: 5.121254E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 3250 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 19:43:11,456] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3250 is begin to save! +[default0]:[2022-10-06 19:43:11,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:11,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:11,877] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:11,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:11,909] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:11,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:11,938] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:11,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:11,965] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:11,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:11,993] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,020] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,105] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,132] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,163] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,190] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,218] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,275] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,304] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,332] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,481] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,510] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_25-model_00-model_states.pt... +[default7]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 19:43:12,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,537] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,566] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 19:43:12,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 19:43:12,569] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/mp_rank_00_model_states.pt +[default0]:[2022-10-06 19:43:12,569] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 19:43:12,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 19:43:12,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 19:43:12,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 19:43:12,848] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 19:43:12,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 19:43:12,783] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 19:43:12,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 19:43:12,842] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 19:43:12,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 19:43:12,807] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 19:43:12,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 19:43:12,870] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 19:43:12,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 19:43:12,864] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 19:43:12,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 19:43:12,807] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 19:43:12,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 19:43:12,885] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 19:43:12,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 19:43:12,884] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 19:43:12,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 19:43:12,885] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 19:43:12,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 19:43:12,892] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 19:43:12,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 19:43:12,878] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 19:43:12,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 19:43:12,887] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 19:43:12,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 19:43:12,895] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 19:43:12,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 19:43:12,855] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 19:43:12,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 19:43:12,955] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 19:43:12,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 19:43:12,926] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 19:43:12,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 19:43:12,916] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 19:43:12,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 19:43:12,953] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 19:43:12,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 19:43:12,927] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 19:43:12,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 19:43:12,937] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2022-10-06 19:43:12,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 19:43:12,912] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2022-10-06 19:43:12,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 19:43:12,902] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2022-10-06 19:43:12,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 19:43:12,926] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2022-10-06 19:43:12,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 19:43:12,963] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2022-10-06 19:43:12,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 19:43:12,926] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2022-10-06 19:43:12,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 19:43:12,911] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2022-10-06 19:43:12,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 19:43:12,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 19:43:12,944] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2022-10-06 19:43:12,943] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2022-10-06 19:43:12,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 19:43:12,924] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]: successfully saved checkpoint at iteration 3250 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default2]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2022-10-06 19:43:12,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2022-10-06 19:43:12,958] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 19:43:12,967] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3250/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:time (ms) | save-checkpoint: 1512.60 +[default4]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2022-10-06 19:43:12,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]: iteration 3251/ 6200 | consumed samples: 3329024 | consumed tokens: 6817841152 | elapsed time per iteration (s): 53.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748170E+00 | loss scale: 4096.0 | grad norm: 5.644 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.158 | TFLOPs: 5.84 | +[default7]: iteration 3252/ 6200 | consumed samples: 3330048 | consumed tokens: 6819938304 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742651E+00 | loss scale: 4096.0 | grad norm: 5.696 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.920 | TFLOPs: 42.34 | +[default7]: iteration 3253/ 6200 | consumed samples: 3331072 | consumed tokens: 6822035456 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753222E+00 | loss scale: 4096.0 | grad norm: 5.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.929 | TFLOPs: 42.35 | +[default7]: iteration 3254/ 6200 | consumed samples: 3332096 | consumed tokens: 6824132608 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718212E+00 | loss scale: 4096.0 | grad norm: 5.057 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.757 | TFLOPs: 42.29 | +[default7]: iteration 3255/ 6200 | consumed samples: 3333120 | consumed tokens: 6826229760 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719070E+00 | loss scale: 4096.0 | grad norm: 5.658 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.836 | TFLOPs: 42.32 | +[default7]: iteration 3256/ 6200 | consumed samples: 3334144 | consumed tokens: 6828326912 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.729208E+00 | loss scale: 4096.0 | grad norm: 6.061 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.660 | TFLOPs: 42.26 | +[default7]: iteration 3257/ 6200 | consumed samples: 3335168 | consumed tokens: 6830424064 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.728754E+00 | loss scale: 4096.0 | grad norm: 5.907 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.743 | TFLOPs: 42.29 | +[default7]: iteration 3258/ 6200 | consumed samples: 3336192 | consumed tokens: 6832521216 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.760604E+00 | loss scale: 4096.0 | grad norm: 5.528 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.850 | TFLOPs: 42.32 | +[default7]: iteration 3259/ 6200 | consumed samples: 3337216 | consumed tokens: 6834618368 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.755152E+00 | loss scale: 4096.0 | grad norm: 5.946 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.038 | TFLOPs: 42.38 | +[default7]: iteration 3260/ 6200 | consumed samples: 3338240 | consumed tokens: 6836715520 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746081E+00 | loss scale: 4096.0 | grad norm: 5.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.513 | TFLOPs: 42.22 | +[default7]: iteration 3261/ 6200 | consumed samples: 3339264 | consumed tokens: 6838812672 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.762667E+00 | loss scale: 4096.0 | grad norm: 6.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.872 | TFLOPs: 42.33 | +[default7]: iteration 3262/ 6200 | consumed samples: 3340288 | consumed tokens: 6840909824 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718232E+00 | loss scale: 4096.0 | grad norm: 7.049 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.779 | TFLOPs: 42.30 | +[default7]: iteration 3263/ 6200 | consumed samples: 3341312 | consumed tokens: 6843006976 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.721941E+00 | loss scale: 4096.0 | grad norm: 6.847 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.596 | TFLOPs: 42.25 | +[default7]: iteration 3264/ 6200 | consumed samples: 3342336 | consumed tokens: 6845104128 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740351E+00 | loss scale: 4096.0 | grad norm: 6.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.357 | TFLOPs: 42.17 | +[default7]: iteration 3265/ 6200 | consumed samples: 3343360 | consumed tokens: 6847201280 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.737896E+00 | loss scale: 4096.0 | grad norm: 6.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.384 | TFLOPs: 41.88 | +[default7]: iteration 3266/ 6200 | consumed samples: 3344384 | consumed tokens: 6849298432 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739138E+00 | loss scale: 4096.0 | grad norm: 5.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.909 | TFLOPs: 42.34 | +[default7]: iteration 3267/ 6200 | consumed samples: 3345408 | consumed tokens: 6851395584 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.734888E+00 | loss scale: 4096.0 | grad norm: 6.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.443 | TFLOPs: 42.20 | +[default7]: iteration 3268/ 6200 | consumed samples: 3346432 | consumed tokens: 6853492736 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738448E+00 | loss scale: 4096.0 | grad norm: 5.756 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.570 | TFLOPs: 42.24 | +[default7]: iteration 3269/ 6200 | consumed samples: 3347456 | consumed tokens: 6855589888 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705135E+00 | loss scale: 4096.0 | grad norm: 5.854 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.876 | TFLOPs: 42.33 | +[default7]: iteration 3270/ 6200 | consumed samples: 3348480 | consumed tokens: 6857687040 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.760846E+00 | loss scale: 4096.0 | grad norm: 5.588 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.116 | TFLOPs: 42.40 | +[default7]: iteration 3271/ 6200 | consumed samples: 3349504 | consumed tokens: 6859784192 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.732883E+00 | loss scale: 4096.0 | grad norm: 5.813 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.548 | TFLOPs: 42.23 | +[default7]: iteration 3272/ 6200 | consumed samples: 3350528 | consumed tokens: 6861881344 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711197E+00 | loss scale: 4096.0 | grad norm: 5.664 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.872 | TFLOPs: 42.33 | +[default7]: iteration 3273/ 6200 | consumed samples: 3351552 | consumed tokens: 6863978496 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727689E+00 | loss scale: 4096.0 | grad norm: 5.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.836 | TFLOPs: 42.32 | +[default7]: iteration 3274/ 6200 | consumed samples: 3352576 | consumed tokens: 6866075648 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.723473E+00 | loss scale: 4096.0 | grad norm: 6.002 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.080 | TFLOPs: 42.39 | +[default7]: iteration 3275/ 6200 | consumed samples: 3353600 | consumed tokens: 6868172800 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743883E+00 | loss scale: 4096.0 | grad norm: 4.879 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.938 | TFLOPs: 42.35 | +[default7]: iteration 3276/ 6200 | consumed samples: 3354624 | consumed tokens: 6870269952 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765409E+00 | loss scale: 4096.0 | grad norm: 5.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.036 | TFLOPs: 42.38 | +[default7]: iteration 3277/ 6200 | consumed samples: 3355648 | consumed tokens: 6872367104 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741021E+00 | loss scale: 4096.0 | grad norm: 5.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.107 | TFLOPs: 42.40 | +[default7]: iteration 3278/ 6200 | consumed samples: 3356672 | consumed tokens: 6874464256 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704922E+00 | loss scale: 4096.0 | grad norm: 4.740 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.111 | TFLOPs: 42.40 | +[default7]: iteration 3279/ 6200 | consumed samples: 3357696 | consumed tokens: 6876561408 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747798E+00 | loss scale: 4096.0 | grad norm: 5.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.117 | TFLOPs: 42.40 | +[default7]: iteration 3280/ 6200 | consumed samples: 3358720 | consumed tokens: 6878658560 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750657E+00 | loss scale: 4096.0 | grad norm: 5.652 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.432 | TFLOPs: 42.20 | +[default7]: iteration 3281/ 6200 | consumed samples: 3359744 | consumed tokens: 6880755712 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743942E+00 | loss scale: 4096.0 | grad norm: 5.892 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.630 | TFLOPs: 42.26 | +[default7]: iteration 3282/ 6200 | consumed samples: 3360768 | consumed tokens: 6882852864 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.728153E+00 | loss scale: 4096.0 | grad norm: 5.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.015 | TFLOPs: 42.37 | +[default7]: iteration 3283/ 6200 | consumed samples: 3361792 | consumed tokens: 6884950016 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753337E+00 | loss scale: 4096.0 | grad norm: 5.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.009 | TFLOPs: 42.37 | +[default7]: iteration 3284/ 6200 | consumed samples: 3362816 | consumed tokens: 6887047168 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740085E+00 | loss scale: 4096.0 | grad norm: 4.862 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.147 | TFLOPs: 42.41 | +[default7]: iteration 3285/ 6200 | consumed samples: 3363840 | consumed tokens: 6889144320 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741808E+00 | loss scale: 4096.0 | grad norm: 5.908 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.112 | TFLOPs: 42.40 | +[default7]: iteration 3286/ 6200 | consumed samples: 3364864 | consumed tokens: 6891241472 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765483E+00 | loss scale: 4096.0 | grad norm: 5.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.033 | TFLOPs: 42.38 | +[default7]: iteration 3287/ 6200 | consumed samples: 3365888 | consumed tokens: 6893338624 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.751493E+00 | loss scale: 4096.0 | grad norm: 5.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.568 | TFLOPs: 42.24 | +[default7]: iteration 3288/ 6200 | consumed samples: 3366912 | consumed tokens: 6895435776 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747414E+00 | loss scale: 4096.0 | grad norm: 7.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.431 | TFLOPs: 42.19 | +[default7]: iteration 3289/ 6200 | consumed samples: 3367936 | consumed tokens: 6897532928 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705070E+00 | loss scale: 4096.0 | grad norm: 5.826 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.123 | TFLOPs: 42.10 | +[default7]: iteration 3290/ 6200 | consumed samples: 3368960 | consumed tokens: 6899630080 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722760E+00 | loss scale: 4096.0 | grad norm: 4.897 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.628 | TFLOPs: 42.25 | +[default7]: iteration 3291/ 6200 | consumed samples: 3369984 | consumed tokens: 6901727232 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.783157E+00 | loss scale: 4096.0 | grad norm: 6.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.661 | TFLOPs: 42.27 | +[default7]: iteration 3292/ 6200 | consumed samples: 3371008 | consumed tokens: 6903824384 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.761364E+00 | loss scale: 8192.0 | grad norm: 2.673 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.501 | TFLOPs: 42.22 | +[default7]: iteration 3293/ 6200 | consumed samples: 3372032 | consumed tokens: 6905921536 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740193E+00 | loss scale: 8192.0 | grad norm: 5.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.446 | TFLOPs: 42.20 | +[default7]: iteration 3294/ 6200 | consumed samples: 3373056 | consumed tokens: 6908018688 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750776E+00 | loss scale: 8192.0 | grad norm: 5.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.279 | TFLOPs: 42.15 | +[default0]:[2022-10-06 19:48:44,986] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192.0, reducing to 8192.0 +[default7]: iteration 3295/ 6200 | consumed samples: 3374080 | consumed tokens: 6910115840 | elapsed time per iteration (s): 7.26 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.737665E+00 | loss scale: 8192.0 | grad norm: 5.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 141.056 | TFLOPs: 43.00 | +[default7]: iteration 3296/ 6200 | consumed samples: 3375104 | consumed tokens: 6912212992 | elapsed time per iteration (s): 7.26 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726375E+00 | loss scale: 4096.0 | grad norm: 5.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 141.023 | TFLOPs: 42.99 | +[default0]:[2022-10-06 19:48:52,247] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192.0, reducing to 4096.0 +[default7]: iteration 3297/ 6200 | consumed samples: 3376128 | consumed tokens: 6914310144 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748579E+00 | loss scale: 4096.0 | grad norm: 5.814 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.865 | TFLOPs: 42.33 | +[default7]: iteration 3298/ 6200 | consumed samples: 3377152 | consumed tokens: 6916407296 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714909E+00 | loss scale: 4096.0 | grad norm: 5.604 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.156 | TFLOPs: 42.42 | +[default7]: iteration 3299/ 6200 | consumed samples: 3378176 | consumed tokens: 6918504448 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.737591E+00 | loss scale: 4096.0 | grad norm: 5.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.735 | TFLOPs: 42.29 | +[default7]: iteration 3300/ 6200 | consumed samples: 3379200 | consumed tokens: 6920601600 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710652E+00 | loss scale: 4096.0 | grad norm: 5.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.999 | TFLOPs: 42.37 | +[default7]: iteration 3301/ 6200 | consumed samples: 3380224 | consumed tokens: 6922698752 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749534E+00 | loss scale: 4096.0 | grad norm: 5.903 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.026 | TFLOPs: 42.38 | +[default7]: iteration 3302/ 6200 | consumed samples: 3381248 | consumed tokens: 6924795904 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746755E+00 | loss scale: 4096.0 | grad norm: 5.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.920 | TFLOPs: 42.34 | +[default7]: iteration 3303/ 6200 | consumed samples: 3382272 | consumed tokens: 6926893056 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750188E+00 | loss scale: 4096.0 | grad norm: 5.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.916 | TFLOPs: 42.34 | +[default7]: iteration 3304/ 6200 | consumed samples: 3383296 | consumed tokens: 6928990208 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746050E+00 | loss scale: 4096.0 | grad norm: 6.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.339 | TFLOPs: 42.17 | +[default7]: iteration 3305/ 6200 | consumed samples: 3384320 | consumed tokens: 6931087360 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714238E+00 | loss scale: 4096.0 | grad norm: 5.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.015 | TFLOPs: 42.37 | +[default7]: iteration 3306/ 6200 | consumed samples: 3385344 | consumed tokens: 6933184512 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714393E+00 | loss scale: 4096.0 | grad norm: 6.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.092 | TFLOPs: 42.40 | +[default7]: iteration 3307/ 6200 | consumed samples: 3386368 | consumed tokens: 6935281664 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741252E+00 | loss scale: 4096.0 | grad norm: 5.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.061 | TFLOPs: 42.39 | +[default7]: iteration 3308/ 6200 | consumed samples: 3387392 | consumed tokens: 6937378816 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740098E+00 | loss scale: 4096.0 | grad norm: 5.997 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.905 | TFLOPs: 42.34 | +[default7]: iteration 3309/ 6200 | consumed samples: 3388416 | consumed tokens: 6939475968 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735905E+00 | loss scale: 4096.0 | grad norm: 7.063 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.278 | TFLOPs: 42.45 | +[default7]: iteration 3310/ 6200 | consumed samples: 3389440 | consumed tokens: 6941573120 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756638E+00 | loss scale: 4096.0 | grad norm: 5.724 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.843 | TFLOPs: 42.32 | +[default7]: iteration 3311/ 6200 | consumed samples: 3390464 | consumed tokens: 6943670272 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.766426E+00 | loss scale: 4096.0 | grad norm: 6.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.497 | TFLOPs: 42.21 | +[default7]: iteration 3312/ 6200 | consumed samples: 3391488 | consumed tokens: 6945767424 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738987E+00 | loss scale: 4096.0 | grad norm: 5.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.674 | TFLOPs: 42.27 | +[default7]: iteration 3313/ 6200 | consumed samples: 3392512 | consumed tokens: 6947864576 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722759E+00 | loss scale: 4096.0 | grad norm: 5.670 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.729 | TFLOPs: 42.29 | +[default7]: iteration 3314/ 6200 | consumed samples: 3393536 | consumed tokens: 6949961728 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742806E+00 | loss scale: 4096.0 | grad norm: 5.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.265 | TFLOPs: 42.14 | +[default7]: iteration 3315/ 6200 | consumed samples: 3394560 | consumed tokens: 6952058880 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.763344E+00 | loss scale: 4096.0 | grad norm: 5.565 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.319 | TFLOPs: 42.16 | +[default7]: iteration 3316/ 6200 | consumed samples: 3395584 | consumed tokens: 6954156032 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731150E+00 | loss scale: 4096.0 | grad norm: 5.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.548 | TFLOPs: 42.23 | +[default7]: iteration 3317/ 6200 | consumed samples: 3396608 | consumed tokens: 6956253184 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746446E+00 | loss scale: 4096.0 | grad norm: 5.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.574 | TFLOPs: 42.24 | +[default7]: iteration 3318/ 6200 | consumed samples: 3397632 | consumed tokens: 6958350336 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708719E+00 | loss scale: 4096.0 | grad norm: 6.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.167 | TFLOPs: 42.11 | +[default7]: iteration 3319/ 6200 | consumed samples: 3398656 | consumed tokens: 6960447488 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741322E+00 | loss scale: 4096.0 | grad norm: 5.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.422 | TFLOPs: 42.19 | +[default7]: iteration 3320/ 6200 | consumed samples: 3399680 | consumed tokens: 6962544640 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743302E+00 | loss scale: 4096.0 | grad norm: 6.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.699 | TFLOPs: 42.28 | +[default7]: iteration 3321/ 6200 | consumed samples: 3400704 | consumed tokens: 6964641792 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731825E+00 | loss scale: 4096.0 | grad norm: 5.641 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.042 | TFLOPs: 42.08 | +[default7]: iteration 3322/ 6200 | consumed samples: 3401728 | consumed tokens: 6966738944 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719338E+00 | loss scale: 4096.0 | grad norm: 6.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.650 | TFLOPs: 42.26 | +[default7]: iteration 3323/ 6200 | consumed samples: 3402752 | consumed tokens: 6968836096 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740517E+00 | loss scale: 4096.0 | grad norm: 5.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.820 | TFLOPs: 42.31 | +[default7]: iteration 3324/ 6200 | consumed samples: 3403776 | consumed tokens: 6970933248 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731273E+00 | loss scale: 4096.0 | grad norm: 7.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.852 | TFLOPs: 42.32 | +[default7]: iteration 3325/ 6200 | consumed samples: 3404800 | consumed tokens: 6973030400 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.786556E+00 | loss scale: 4096.0 | grad norm: 5.040 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.578 | TFLOPs: 42.24 | +[default7]: iteration 3326/ 6200 | consumed samples: 3405824 | consumed tokens: 6975127552 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.720266E+00 | loss scale: 4096.0 | grad norm: 5.767 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.778 | TFLOPs: 42.30 | +[default7]: iteration 3327/ 6200 | consumed samples: 3406848 | consumed tokens: 6977224704 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.723231E+00 | loss scale: 4096.0 | grad norm: 7.021 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.115 | TFLOPs: 42.40 | +[default7]: iteration 3328/ 6200 | consumed samples: 3407872 | consumed tokens: 6979321856 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742549E+00 | loss scale: 4096.0 | grad norm: 7.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.703 | TFLOPs: 42.28 | +[default7]: iteration 3329/ 6200 | consumed samples: 3408896 | consumed tokens: 6981419008 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738124E+00 | loss scale: 4096.0 | grad norm: 5.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.793 | TFLOPs: 42.31 | +[default7]: iteration 3330/ 6200 | consumed samples: 3409920 | consumed tokens: 6983516160 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749002E+00 | loss scale: 4096.0 | grad norm: 8.038 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.201 | TFLOPs: 42.43 | +[default7]: iteration 3331/ 6200 | consumed samples: 3410944 | consumed tokens: 6985613312 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727431E+00 | loss scale: 4096.0 | grad norm: 6.700 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.168 | TFLOPs: 42.42 | +[default7]: iteration 3332/ 6200 | consumed samples: 3411968 | consumed tokens: 6987710464 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.734294E+00 | loss scale: 4096.0 | grad norm: 5.810 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.856 | TFLOPs: 42.32 | +[default7]: iteration 3333/ 6200 | consumed samples: 3412992 | consumed tokens: 6989807616 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.782787E+00 | loss scale: 4096.0 | grad norm: 5.024 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.183 | TFLOPs: 42.42 | +[default7]: iteration 3334/ 6200 | consumed samples: 3414016 | consumed tokens: 6991904768 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735946E+00 | loss scale: 4096.0 | grad norm: 5.934 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.264 | TFLOPs: 42.45 | +[default7]: iteration 3335/ 6200 | consumed samples: 3415040 | consumed tokens: 6994001920 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715064E+00 | loss scale: 4096.0 | grad norm: 6.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.331 | TFLOPs: 42.47 | +[default7]: iteration 3336/ 6200 | consumed samples: 3416064 | consumed tokens: 6996099072 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739945E+00 | loss scale: 4096.0 | grad norm: 5.620 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.081 | TFLOPs: 42.39 | +[default7]: iteration 3337/ 6200 | consumed samples: 3417088 | consumed tokens: 6998196224 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.744349E+00 | loss scale: 4096.0 | grad norm: 5.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.929 | TFLOPs: 42.35 | +[default7]: iteration 3338/ 6200 | consumed samples: 3418112 | consumed tokens: 7000293376 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726701E+00 | loss scale: 4096.0 | grad norm: 6.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.918 | TFLOPs: 42.34 | +[default7]: iteration 3339/ 6200 | consumed samples: 3419136 | consumed tokens: 7002390528 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.744771E+00 | loss scale: 4096.0 | grad norm: 6.741 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.250 | TFLOPs: 42.44 | +[default7]: iteration 3340/ 6200 | consumed samples: 3420160 | consumed tokens: 7004487680 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.736658E+00 | loss scale: 4096.0 | grad norm: 5.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.968 | TFLOPs: 42.36 | +[default7]: iteration 3341/ 6200 | consumed samples: 3421184 | consumed tokens: 7006584832 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.728478E+00 | loss scale: 4096.0 | grad norm: 5.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.885 | TFLOPs: 42.33 | +[default7]: iteration 3342/ 6200 | consumed samples: 3422208 | consumed tokens: 7008681984 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741353E+00 | loss scale: 4096.0 | grad norm: 6.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.839 | TFLOPs: 42.32 | +[default7]: iteration 3343/ 6200 | consumed samples: 3423232 | consumed tokens: 7010779136 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726003E+00 | loss scale: 4096.0 | grad norm: 5.891 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.008 | TFLOPs: 42.37 | +[default7]: iteration 3344/ 6200 | consumed samples: 3424256 | consumed tokens: 7012876288 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.737827E+00 | loss scale: 4096.0 | grad norm: 5.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.021 | TFLOPs: 42.37 | +[default7]: iteration 3345/ 6200 | consumed samples: 3425280 | consumed tokens: 7014973440 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703422E+00 | loss scale: 4096.0 | grad norm: 5.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.322 | TFLOPs: 42.47 | +[default7]: iteration 3346/ 6200 | consumed samples: 3426304 | consumed tokens: 7017070592 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.737500E+00 | loss scale: 4096.0 | grad norm: 4.936 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.168 | TFLOPs: 42.42 | +[default7]: iteration 3347/ 6200 | consumed samples: 3427328 | consumed tokens: 7019167744 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753275E+00 | loss scale: 4096.0 | grad norm: 4.832 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.329 | TFLOPs: 42.47 | +[default7]: iteration 3348/ 6200 | consumed samples: 3428352 | consumed tokens: 7021264896 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.758665E+00 | loss scale: 4096.0 | grad norm: 6.819 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.171 | TFLOPs: 42.42 | +[default7]: iteration 3349/ 6200 | consumed samples: 3429376 | consumed tokens: 7023362048 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731655E+00 | loss scale: 4096.0 | grad norm: 5.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.271 | TFLOPs: 42.45 | +[default7]: iteration 3350/ 6200 | consumed samples: 3430400 | consumed tokens: 7025459200 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739561E+00 | loss scale: 4096.0 | grad norm: 5.648 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.050 | TFLOPs: 42.38 | +[default7]: iteration 3351/ 6200 | consumed samples: 3431424 | consumed tokens: 7027556352 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.762599E+00 | loss scale: 4096.0 | grad norm: 5.781 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.201 | TFLOPs: 42.43 | +[default7]: iteration 3352/ 6200 | consumed samples: 3432448 | consumed tokens: 7029653504 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743475E+00 | loss scale: 4096.0 | grad norm: 5.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.082 | TFLOPs: 42.39 | +[default7]: iteration 3353/ 6200 | consumed samples: 3433472 | consumed tokens: 7031750656 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722967E+00 | loss scale: 4096.0 | grad norm: 6.065 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.252 | TFLOPs: 42.45 | +[default7]: iteration 3354/ 6200 | consumed samples: 3434496 | consumed tokens: 7033847808 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726773E+00 | loss scale: 4096.0 | grad norm: 5.778 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.668 | TFLOPs: 42.27 | +[default7]: iteration 3355/ 6200 | consumed samples: 3435520 | consumed tokens: 7035944960 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.720011E+00 | loss scale: 4096.0 | grad norm: 6.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.056 | TFLOPs: 42.39 | +[default7]: iteration 3356/ 6200 | consumed samples: 3436544 | consumed tokens: 7038042112 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.751912E+00 | loss scale: 4096.0 | grad norm: 5.605 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.218 | TFLOPs: 42.43 | +[default7]: iteration 3357/ 6200 | consumed samples: 3437568 | consumed tokens: 7040139264 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750786E+00 | loss scale: 4096.0 | grad norm: 5.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.884 | TFLOPs: 42.33 | +[default7]: iteration 3358/ 6200 | consumed samples: 3438592 | consumed tokens: 7042236416 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.754255E+00 | loss scale: 4096.0 | grad norm: 5.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.002 | TFLOPs: 42.37 | +[default7]: iteration 3359/ 6200 | consumed samples: 3439616 | consumed tokens: 7044333568 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739430E+00 | loss scale: 4096.0 | grad norm: 5.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.432 | TFLOPs: 42.20 | +[default7]: iteration 3360/ 6200 | consumed samples: 3440640 | consumed tokens: 7046430720 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714868E+00 | loss scale: 4096.0 | grad norm: 5.990 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.201 | TFLOPs: 42.12 | +[default7]: iteration 3361/ 6200 | consumed samples: 3441664 | consumed tokens: 7048527872 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731681E+00 | loss scale: 4096.0 | grad norm: 5.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.701 | TFLOPs: 42.28 | +[default7]: iteration 3362/ 6200 | consumed samples: 3442688 | consumed tokens: 7050625024 | elapsed time per iteration (s): 7.29 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765226E+00 | loss scale: 2048.0 | grad norm: 5.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 140.387 | TFLOPs: 42.79 | +[default0]:[2022-10-06 19:56:58,803] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[default7]: iteration 3363/ 6200 | consumed samples: 3443712 | consumed tokens: 7052722176 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753519E+00 | loss scale: 2048.0 | grad norm: 5.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.514 | TFLOPs: 41.92 | +[default7]: iteration 3364/ 6200 | consumed samples: 3444736 | consumed tokens: 7054819328 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.728303E+00 | loss scale: 2048.0 | grad norm: 6.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.750 | TFLOPs: 41.99 | +[default7]: iteration 3365/ 6200 | consumed samples: 3445760 | consumed tokens: 7056916480 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750683E+00 | loss scale: 2048.0 | grad norm: 5.953 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.638 | TFLOPs: 41.95 | +[default7]: iteration 3366/ 6200 | consumed samples: 3446784 | consumed tokens: 7059013632 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740375E+00 | loss scale: 2048.0 | grad norm: 4.977 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.936 | TFLOPs: 42.04 | +[default7]: iteration 3367/ 6200 | consumed samples: 3447808 | consumed tokens: 7061110784 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.751549E+00 | loss scale: 2048.0 | grad norm: 7.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.465 | TFLOPs: 41.90 | +[default7]: iteration 3368/ 6200 | consumed samples: 3448832 | consumed tokens: 7063207936 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.751319E+00 | loss scale: 2048.0 | grad norm: 5.024 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.021 | TFLOPs: 42.07 | +[default7]: iteration 3369/ 6200 | consumed samples: 3449856 | consumed tokens: 7065305088 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.725204E+00 | loss scale: 2048.0 | grad norm: 5.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.965 | TFLOPs: 42.05 | +[default7]: iteration 3370/ 6200 | consumed samples: 3450880 | consumed tokens: 7067402240 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726356E+00 | loss scale: 2048.0 | grad norm: 6.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.157 | TFLOPs: 42.11 | +[default7]: iteration 3371/ 6200 | consumed samples: 3451904 | consumed tokens: 7069499392 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742791E+00 | loss scale: 2048.0 | grad norm: 6.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.315 | TFLOPs: 41.85 | +[default7]: iteration 3372/ 6200 | consumed samples: 3452928 | consumed tokens: 7071596544 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.765797E+00 | loss scale: 2048.0 | grad norm: 6.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.752 | TFLOPs: 41.99 | +[default7]: iteration 3373/ 6200 | consumed samples: 3453952 | consumed tokens: 7073693696 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.728126E+00 | loss scale: 2048.0 | grad norm: 6.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.247 | TFLOPs: 42.14 | +[default7]: iteration 3374/ 6200 | consumed samples: 3454976 | consumed tokens: 7075790848 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.734143E+00 | loss scale: 2048.0 | grad norm: 6.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.535 | TFLOPs: 41.92 | +[default7]: iteration 3375/ 6200 | consumed samples: 3456000 | consumed tokens: 7077888000 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710206E+00 | loss scale: 2048.0 | grad norm: 7.583 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.850 | TFLOPs: 42.02 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 3375 | lm loss value: 3.562291E+00 | lm loss PPL: 3.524385E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 3375 | lm loss value: 1.629514E+00 | lm loss PPL: 5.101393E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 3376/ 6200 | consumed samples: 3457024 | consumed tokens: 7079985152 | elapsed time per iteration (s): 52.01 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743368E+00 | loss scale: 2048.0 | grad norm: 6.823 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.687 | TFLOPs: 6.00 | +[default7]: iteration 3377/ 6200 | consumed samples: 3458048 | consumed tokens: 7082082304 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.752035E+00 | loss scale: 2048.0 | grad norm: 5.660 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.561 | TFLOPs: 42.23 | +[default7]: iteration 3378/ 6200 | consumed samples: 3459072 | consumed tokens: 7084179456 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739399E+00 | loss scale: 2048.0 | grad norm: 7.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.727 | TFLOPs: 42.29 | +[default7]: iteration 3379/ 6200 | consumed samples: 3460096 | consumed tokens: 7086276608 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719945E+00 | loss scale: 2048.0 | grad norm: 6.010 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.889 | TFLOPs: 42.33 | +[default7]: iteration 3380/ 6200 | consumed samples: 3461120 | consumed tokens: 7088373760 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.736133E+00 | loss scale: 2048.0 | grad norm: 7.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.182 | TFLOPs: 42.42 | +[default7]: iteration 3381/ 6200 | consumed samples: 3462144 | consumed tokens: 7090470912 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.769810E+00 | loss scale: 2048.0 | grad norm: 6.624 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.132 | TFLOPs: 42.41 | +[default7]: iteration 3382/ 6200 | consumed samples: 3463168 | consumed tokens: 7092568064 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709145E+00 | loss scale: 2048.0 | grad norm: 5.613 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.115 | TFLOPs: 42.40 | +[default7]: iteration 3383/ 6200 | consumed samples: 3464192 | consumed tokens: 7094665216 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727208E+00 | loss scale: 2048.0 | grad norm: 6.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.106 | TFLOPs: 42.40 | +[default7]: iteration 3384/ 6200 | consumed samples: 3465216 | consumed tokens: 7096762368 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748116E+00 | loss scale: 2048.0 | grad norm: 5.041 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.194 | TFLOPs: 42.43 | +[default7]: iteration 3385/ 6200 | consumed samples: 3466240 | consumed tokens: 7098859520 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731479E+00 | loss scale: 2048.0 | grad norm: 5.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.360 | TFLOPs: 42.48 | +[default7]: iteration 3386/ 6200 | consumed samples: 3467264 | consumed tokens: 7100956672 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750174E+00 | loss scale: 2048.0 | grad norm: 7.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.954 | TFLOPs: 42.35 | +[default7]: iteration 3387/ 6200 | consumed samples: 3468288 | consumed tokens: 7103053824 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735124E+00 | loss scale: 2048.0 | grad norm: 5.086 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.259 | TFLOPs: 42.45 | +[default7]: iteration 3388/ 6200 | consumed samples: 3469312 | consumed tokens: 7105150976 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731820E+00 | loss scale: 2048.0 | grad norm: 6.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.698 | TFLOPs: 42.28 | +[default7]: iteration 3389/ 6200 | consumed samples: 3470336 | consumed tokens: 7107248128 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.753738E+00 | loss scale: 2048.0 | grad norm: 6.944 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.725 | TFLOPs: 42.28 | +[default7]: iteration 3390/ 6200 | consumed samples: 3471360 | consumed tokens: 7109345280 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738332E+00 | loss scale: 2048.0 | grad norm: 4.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.883 | TFLOPs: 42.33 | +[default7]: iteration 3391/ 6200 | consumed samples: 3472384 | consumed tokens: 7111442432 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.732820E+00 | loss scale: 2048.0 | grad norm: 5.726 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.618 | TFLOPs: 42.25 | +[default7]: iteration 3392/ 6200 | consumed samples: 3473408 | consumed tokens: 7113539584 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738063E+00 | loss scale: 2048.0 | grad norm: 5.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.187 | TFLOPs: 41.82 | +[default7]: iteration 3393/ 6200 | consumed samples: 3474432 | consumed tokens: 7115636736 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738239E+00 | loss scale: 2048.0 | grad norm: 4.996 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.542 | TFLOPs: 42.23 | +[default7]: iteration 3394/ 6200 | consumed samples: 3475456 | consumed tokens: 7117733888 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750835E+00 | loss scale: 2048.0 | grad norm: 6.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.419 | TFLOPs: 42.19 | +[default7]: iteration 3395/ 6200 | consumed samples: 3476480 | consumed tokens: 7119831040 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715604E+00 | loss scale: 2048.0 | grad norm: 5.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.243 | TFLOPs: 42.14 | +[default7]: iteration 3396/ 6200 | consumed samples: 3477504 | consumed tokens: 7121928192 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713602E+00 | loss scale: 2048.0 | grad norm: 5.898 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.401 | TFLOPs: 42.19 | +[default7]: iteration 3397/ 6200 | consumed samples: 3478528 | consumed tokens: 7124025344 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.768553E+00 | loss scale: 2048.0 | grad norm: 5.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.481 | TFLOPs: 42.21 | +[default7]: iteration 3398/ 6200 | consumed samples: 3479552 | consumed tokens: 7126122496 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726994E+00 | loss scale: 2048.0 | grad norm: 6.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.845 | TFLOPs: 42.32 | +[default7]: iteration 3399/ 6200 | consumed samples: 3480576 | consumed tokens: 7128219648 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740639E+00 | loss scale: 2048.0 | grad norm: 5.792 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.124 | TFLOPs: 42.41 | +[default7]: iteration 3400/ 6200 | consumed samples: 3481600 | consumed tokens: 7130316800 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.729271E+00 | loss scale: 2048.0 | grad norm: 5.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.819 | TFLOPs: 42.31 | +[default7]: iteration 3401/ 6200 | consumed samples: 3482624 | consumed tokens: 7132413952 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.723002E+00 | loss scale: 2048.0 | grad norm: 6.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.924 | TFLOPs: 42.35 | +[default7]: iteration 3402/ 6200 | consumed samples: 3483648 | consumed tokens: 7134511104 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747888E+00 | loss scale: 2048.0 | grad norm: 5.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.731 | TFLOPs: 42.29 | +[default7]: iteration 3403/ 6200 | consumed samples: 3484672 | consumed tokens: 7136608256 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.721607E+00 | loss scale: 2048.0 | grad norm: 5.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.543 | TFLOPs: 42.23 | +[default7]: iteration 3404/ 6200 | consumed samples: 3485696 | consumed tokens: 7138705408 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.752740E+00 | loss scale: 2048.0 | grad norm: 5.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.455 | TFLOPs: 42.20 | +[default7]: iteration 3405/ 6200 | consumed samples: 3486720 | consumed tokens: 7140802560 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738633E+00 | loss scale: 2048.0 | grad norm: 6.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.499 | TFLOPs: 42.22 | +[default7]: iteration 3406/ 6200 | consumed samples: 3487744 | consumed tokens: 7142899712 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.755668E+00 | loss scale: 2048.0 | grad norm: 5.740 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.599 | TFLOPs: 42.25 | +[default7]: iteration 3407/ 6200 | consumed samples: 3488768 | consumed tokens: 7144996864 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.725163E+00 | loss scale: 2048.0 | grad norm: 5.639 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.276 | TFLOPs: 42.15 | +[default7]: iteration 3408/ 6200 | consumed samples: 3489792 | consumed tokens: 7147094016 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739125E+00 | loss scale: 2048.0 | grad norm: 5.771 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.427 | TFLOPs: 42.19 | +[default7]: iteration 3409/ 6200 | consumed samples: 3490816 | consumed tokens: 7149191168 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741944E+00 | loss scale: 2048.0 | grad norm: 6.018 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.633 | TFLOPs: 42.26 | +[default7]: iteration 3410/ 6200 | consumed samples: 3491840 | consumed tokens: 7151288320 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713237E+00 | loss scale: 2048.0 | grad norm: 5.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.924 | TFLOPs: 42.35 | +[default7]: iteration 3411/ 6200 | consumed samples: 3492864 | consumed tokens: 7153385472 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741462E+00 | loss scale: 2048.0 | grad norm: 5.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.899 | TFLOPs: 42.34 | +[default7]: iteration 3412/ 6200 | consumed samples: 3493888 | consumed tokens: 7155482624 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.729986E+00 | loss scale: 2048.0 | grad norm: 5.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.646 | TFLOPs: 42.26 | +[default7]: iteration 3413/ 6200 | consumed samples: 3494912 | consumed tokens: 7157579776 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742150E+00 | loss scale: 2048.0 | grad norm: 5.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.949 | TFLOPs: 42.35 | +[default7]: iteration 3414/ 6200 | consumed samples: 3495936 | consumed tokens: 7159676928 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731515E+00 | loss scale: 2048.0 | grad norm: 5.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.602 | TFLOPs: 42.25 | +[default7]: iteration 3415/ 6200 | consumed samples: 3496960 | consumed tokens: 7161774080 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726866E+00 | loss scale: 2048.0 | grad norm: 5.648 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.542 | TFLOPs: 42.23 | +[default7]: iteration 3416/ 6200 | consumed samples: 3497984 | consumed tokens: 7163871232 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722762E+00 | loss scale: 2048.0 | grad norm: 5.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.044 | TFLOPs: 42.08 | +[default7]: iteration 3417/ 6200 | consumed samples: 3499008 | consumed tokens: 7165968384 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.737657E+00 | loss scale: 2048.0 | grad norm: 5.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.452 | TFLOPs: 42.20 | +[default7]: iteration 3418/ 6200 | consumed samples: 3500032 | consumed tokens: 7168065536 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.737065E+00 | loss scale: 2048.0 | grad norm: 4.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.206 | TFLOPs: 42.13 | +[default7]: iteration 3419/ 6200 | consumed samples: 3501056 | consumed tokens: 7170162688 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735915E+00 | loss scale: 2048.0 | grad norm: 6.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.472 | TFLOPs: 42.21 | +[default7]: iteration 3420/ 6200 | consumed samples: 3502080 | consumed tokens: 7172259840 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718592E+00 | loss scale: 2048.0 | grad norm: 6.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.428 | TFLOPs: 42.19 | +[default7]: iteration 3421/ 6200 | consumed samples: 3503104 | consumed tokens: 7174356992 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743881E+00 | loss scale: 2048.0 | grad norm: 5.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.924 | TFLOPs: 42.04 | +[default7]: iteration 3422/ 6200 | consumed samples: 3504128 | consumed tokens: 7176454144 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718534E+00 | loss scale: 2048.0 | grad norm: 5.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.910 | TFLOPs: 42.04 | +[default7]: iteration 3423/ 6200 | consumed samples: 3505152 | consumed tokens: 7178551296 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.728031E+00 | loss scale: 2048.0 | grad norm: 4.874 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.167 | TFLOPs: 42.11 | +[default7]: iteration 3424/ 6200 | consumed samples: 3506176 | consumed tokens: 7180648448 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.751804E+00 | loss scale: 2048.0 | grad norm: 5.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.222 | TFLOPs: 42.13 | +[default7]: iteration 3425/ 6200 | consumed samples: 3507200 | consumed tokens: 7182745600 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740746E+00 | loss scale: 2048.0 | grad norm: 5.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.189 | TFLOPs: 41.82 | +[default7]: iteration 3426/ 6200 | consumed samples: 3508224 | consumed tokens: 7184842752 | elapsed time per iteration (s): 7.48 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743646E+00 | loss scale: 2048.0 | grad norm: 5.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.820 | TFLOPs: 41.70 | +[default7]: iteration 3427/ 6200 | consumed samples: 3509248 | consumed tokens: 7186939904 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727971E+00 | loss scale: 2048.0 | grad norm: 5.798 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.827 | TFLOPs: 42.01 | +[default7]: iteration 3428/ 6200 | consumed samples: 3510272 | consumed tokens: 7189037056 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.730709E+00 | loss scale: 2048.0 | grad norm: 5.790 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.685 | TFLOPs: 42.27 | +[default7]: iteration 3429/ 6200 | consumed samples: 3511296 | consumed tokens: 7191134208 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.769708E+00 | loss scale: 2048.0 | grad norm: 5.977 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.102 | TFLOPs: 41.79 | +[default7]: iteration 3430/ 6200 | consumed samples: 3512320 | consumed tokens: 7193231360 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722686E+00 | loss scale: 2048.0 | grad norm: 5.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.026 | TFLOPs: 42.07 | +[default7]: iteration 3431/ 6200 | consumed samples: 3513344 | consumed tokens: 7195328512 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.728509E+00 | loss scale: 2048.0 | grad norm: 5.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.320 | TFLOPs: 42.16 | +[default7]: iteration 3432/ 6200 | consumed samples: 3514368 | consumed tokens: 7197425664 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.730343E+00 | loss scale: 2048.0 | grad norm: 6.569 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.035 | TFLOPs: 42.07 | +[default7]: iteration 3433/ 6200 | consumed samples: 3515392 | consumed tokens: 7199522816 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742130E+00 | loss scale: 2048.0 | grad norm: 4.971 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.477 | TFLOPs: 42.21 | +[default7]: iteration 3434/ 6200 | consumed samples: 3516416 | consumed tokens: 7201619968 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706722E+00 | loss scale: 2048.0 | grad norm: 5.636 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.998 | TFLOPs: 42.06 | +[default7]: iteration 3435/ 6200 | consumed samples: 3517440 | consumed tokens: 7203717120 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.729213E+00 | loss scale: 2048.0 | grad norm: 6.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.273 | TFLOPs: 42.15 | +[default7]: iteration 3436/ 6200 | consumed samples: 3518464 | consumed tokens: 7205814272 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.732630E+00 | loss scale: 2048.0 | grad norm: 5.739 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.933 | TFLOPs: 42.04 | +[default7]: iteration 3437/ 6200 | consumed samples: 3519488 | consumed tokens: 7207911424 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711884E+00 | loss scale: 2048.0 | grad norm: 6.074 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.319 | TFLOPs: 42.16 | +[default7]: iteration 3438/ 6200 | consumed samples: 3520512 | consumed tokens: 7210008576 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.736591E+00 | loss scale: 2048.0 | grad norm: 5.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.707 | TFLOPs: 41.97 | +[default7]: iteration 3439/ 6200 | consumed samples: 3521536 | consumed tokens: 7212105728 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.732315E+00 | loss scale: 2048.0 | grad norm: 5.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.739 | TFLOPs: 41.98 | +[default7]: iteration 3440/ 6200 | consumed samples: 3522560 | consumed tokens: 7214202880 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747471E+00 | loss scale: 2048.0 | grad norm: 6.848 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.104 | TFLOPs: 42.10 | +[default7]: iteration 3441/ 6200 | consumed samples: 3523584 | consumed tokens: 7216300032 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748158E+00 | loss scale: 2048.0 | grad norm: 6.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.851 | TFLOPs: 42.32 | +[default7]: iteration 3442/ 6200 | consumed samples: 3524608 | consumed tokens: 7218397184 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.730173E+00 | loss scale: 2048.0 | grad norm: 5.067 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.724 | TFLOPs: 42.28 | +[default7]: iteration 3443/ 6200 | consumed samples: 3525632 | consumed tokens: 7220494336 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756760E+00 | loss scale: 2048.0 | grad norm: 6.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.685 | TFLOPs: 42.27 | +[default7]: iteration 3444/ 6200 | consumed samples: 3526656 | consumed tokens: 7222591488 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726987E+00 | loss scale: 2048.0 | grad norm: 5.925 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.912 | TFLOPs: 42.34 | +[default7]: iteration 3445/ 6200 | consumed samples: 3527680 | consumed tokens: 7224688640 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.720786E+00 | loss scale: 2048.0 | grad norm: 5.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.307 | TFLOPs: 42.16 | +[default7]: iteration 3446/ 6200 | consumed samples: 3528704 | consumed tokens: 7226785792 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735478E+00 | loss scale: 2048.0 | grad norm: 4.947 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.395 | TFLOPs: 42.18 | +[default7]: iteration 3447/ 6200 | consumed samples: 3529728 | consumed tokens: 7228882944 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739986E+00 | loss scale: 2048.0 | grad norm: 5.044 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.343 | TFLOPs: 42.17 | +[default7]: iteration 3448/ 6200 | consumed samples: 3530752 | consumed tokens: 7230980096 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693681E+00 | loss scale: 2048.0 | grad norm: 6.729 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.661 | TFLOPs: 42.27 | +[default7]: iteration 3449/ 6200 | consumed samples: 3531776 | consumed tokens: 7233077248 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.736665E+00 | loss scale: 2048.0 | grad norm: 5.936 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.621 | TFLOPs: 42.25 | +[default7]: iteration 3450/ 6200 | consumed samples: 3532800 | consumed tokens: 7235174400 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.744334E+00 | loss scale: 2048.0 | grad norm: 5.988 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.196 | TFLOPs: 42.12 | +[default7]: iteration 3451/ 6200 | consumed samples: 3533824 | consumed tokens: 7237271552 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.701698E+00 | loss scale: 2048.0 | grad norm: 5.639 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.490 | TFLOPs: 42.21 | +[default7]: iteration 3452/ 6200 | consumed samples: 3534848 | consumed tokens: 7239368704 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724404E+00 | loss scale: 2048.0 | grad norm: 5.047 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.785 | TFLOPs: 42.30 | +[default7]: iteration 3453/ 6200 | consumed samples: 3535872 | consumed tokens: 7241465856 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.723973E+00 | loss scale: 2048.0 | grad norm: 5.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.531 | TFLOPs: 42.23 | +[default7]: iteration 3454/ 6200 | consumed samples: 3536896 | consumed tokens: 7243563008 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.732089E+00 | loss scale: 2048.0 | grad norm: 5.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.342 | TFLOPs: 42.17 | +[default7]: iteration 3455/ 6200 | consumed samples: 3537920 | consumed tokens: 7245660160 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.770308E+00 | loss scale: 2048.0 | grad norm: 6.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.780 | TFLOPs: 42.30 | +[default7]: iteration 3456/ 6200 | consumed samples: 3538944 | consumed tokens: 7247757312 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748077E+00 | loss scale: 2048.0 | grad norm: 5.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.426 | TFLOPs: 42.19 | +[default7]: iteration 3457/ 6200 | consumed samples: 3539968 | consumed tokens: 7249854464 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.723895E+00 | loss scale: 2048.0 | grad norm: 5.739 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.394 | TFLOPs: 42.18 | +[default7]: iteration 3458/ 6200 | consumed samples: 3540992 | consumed tokens: 7251951616 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743331E+00 | loss scale: 2048.0 | grad norm: 5.996 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.528 | TFLOPs: 42.22 | +[default7]: iteration 3459/ 6200 | consumed samples: 3542016 | consumed tokens: 7254048768 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718921E+00 | loss scale: 2048.0 | grad norm: 6.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.110 | TFLOPs: 42.10 | +[default7]: iteration 3460/ 6200 | consumed samples: 3543040 | consumed tokens: 7256145920 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.729705E+00 | loss scale: 2048.0 | grad norm: 6.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.774 | TFLOPs: 42.30 | +[default7]: iteration 3461/ 6200 | consumed samples: 3544064 | consumed tokens: 7258243072 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719736E+00 | loss scale: 2048.0 | grad norm: 5.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.851 | TFLOPs: 42.32 | +[default7]: iteration 3462/ 6200 | consumed samples: 3545088 | consumed tokens: 7260340224 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.723919E+00 | loss scale: 2048.0 | grad norm: 4.833 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.857 | TFLOPs: 42.32 | +[default7]: iteration 3463/ 6200 | consumed samples: 3546112 | consumed tokens: 7262437376 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.723696E+00 | loss scale: 2048.0 | grad norm: 5.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.837 | TFLOPs: 42.01 | +[default7]: iteration 3464/ 6200 | consumed samples: 3547136 | consumed tokens: 7264534528 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710495E+00 | loss scale: 2048.0 | grad norm: 5.513 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.947 | TFLOPs: 42.35 | +[default7]: iteration 3465/ 6200 | consumed samples: 3548160 | consumed tokens: 7266631680 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703569E+00 | loss scale: 2048.0 | grad norm: 5.863 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.004 | TFLOPs: 42.37 | +[default7]: iteration 3466/ 6200 | consumed samples: 3549184 | consumed tokens: 7268728832 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.752050E+00 | loss scale: 2048.0 | grad norm: 5.577 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.794 | TFLOPs: 42.31 | +[default7]: iteration 3467/ 6200 | consumed samples: 3550208 | consumed tokens: 7270825984 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.745232E+00 | loss scale: 2048.0 | grad norm: 6.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.071 | TFLOPs: 42.09 | +[default7]: iteration 3468/ 6200 | consumed samples: 3551232 | consumed tokens: 7272923136 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726676E+00 | loss scale: 2048.0 | grad norm: 5.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.474 | TFLOPs: 42.21 | +[default7]: iteration 3469/ 6200 | consumed samples: 3552256 | consumed tokens: 7275020288 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742445E+00 | loss scale: 2048.0 | grad norm: 6.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.769 | TFLOPs: 42.30 | +[default7]: iteration 3470/ 6200 | consumed samples: 3553280 | consumed tokens: 7277117440 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.721115E+00 | loss scale: 2048.0 | grad norm: 5.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.768 | TFLOPs: 42.30 | +[default7]: iteration 3471/ 6200 | consumed samples: 3554304 | consumed tokens: 7279214592 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697466E+00 | loss scale: 2048.0 | grad norm: 5.039 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.126 | TFLOPs: 42.41 | +[default7]: iteration 3472/ 6200 | consumed samples: 3555328 | consumed tokens: 7281311744 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735451E+00 | loss scale: 2048.0 | grad norm: 5.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.080 | TFLOPs: 42.39 | +[default7]: iteration 3473/ 6200 | consumed samples: 3556352 | consumed tokens: 7283408896 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.744465E+00 | loss scale: 2048.0 | grad norm: 5.739 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.150 | TFLOPs: 42.41 | +[default7]: iteration 3474/ 6200 | consumed samples: 3557376 | consumed tokens: 7285506048 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.721425E+00 | loss scale: 2048.0 | grad norm: 5.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.082 | TFLOPs: 42.39 | +[default7]: iteration 3475/ 6200 | consumed samples: 3558400 | consumed tokens: 7287603200 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692753E+00 | loss scale: 2048.0 | grad norm: 5.899 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.949 | TFLOPs: 42.35 | +[default7]: iteration 3476/ 6200 | consumed samples: 3559424 | consumed tokens: 7289700352 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748190E+00 | loss scale: 2048.0 | grad norm: 6.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.875 | TFLOPs: 42.33 | +[default7]: iteration 3477/ 6200 | consumed samples: 3560448 | consumed tokens: 7291797504 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.737726E+00 | loss scale: 2048.0 | grad norm: 4.922 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.024 | TFLOPs: 42.38 | +[default7]: iteration 3478/ 6200 | consumed samples: 3561472 | consumed tokens: 7293894656 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746747E+00 | loss scale: 2048.0 | grad norm: 5.828 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.149 | TFLOPs: 42.41 | +[default7]: iteration 3479/ 6200 | consumed samples: 3562496 | consumed tokens: 7295991808 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718113E+00 | loss scale: 2048.0 | grad norm: 6.830 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.116 | TFLOPs: 42.40 | +[default7]: iteration 3480/ 6200 | consumed samples: 3563520 | consumed tokens: 7298088960 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704536E+00 | loss scale: 2048.0 | grad norm: 5.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.149 | TFLOPs: 42.41 | +[default7]: iteration 3481/ 6200 | consumed samples: 3564544 | consumed tokens: 7300186112 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.730567E+00 | loss scale: 2048.0 | grad norm: 5.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.668 | TFLOPs: 42.27 | +[default7]: iteration 3482/ 6200 | consumed samples: 3565568 | consumed tokens: 7302283264 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739876E+00 | loss scale: 2048.0 | grad norm: 6.006 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.316 | TFLOPs: 42.16 | +[default7]: iteration 3483/ 6200 | consumed samples: 3566592 | consumed tokens: 7304380416 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711492E+00 | loss scale: 2048.0 | grad norm: 5.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.642 | TFLOPs: 42.26 | +[default7]: iteration 3484/ 6200 | consumed samples: 3567616 | consumed tokens: 7306477568 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.729779E+00 | loss scale: 2048.0 | grad norm: 5.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.117 | TFLOPs: 42.40 | +[default7]: iteration 3485/ 6200 | consumed samples: 3568640 | consumed tokens: 7308574720 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724417E+00 | loss scale: 2048.0 | grad norm: 5.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.259 | TFLOPs: 42.45 | +[default7]: iteration 3486/ 6200 | consumed samples: 3569664 | consumed tokens: 7310671872 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717359E+00 | loss scale: 2048.0 | grad norm: 5.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.258 | TFLOPs: 42.45 | +[default7]: iteration 3487/ 6200 | consumed samples: 3570688 | consumed tokens: 7312769024 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739678E+00 | loss scale: 2048.0 | grad norm: 5.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.241 | TFLOPs: 42.44 | +[default7]: iteration 3488/ 6200 | consumed samples: 3571712 | consumed tokens: 7314866176 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.744864E+00 | loss scale: 2048.0 | grad norm: 5.968 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.925 | TFLOPs: 42.35 | +[default7]: iteration 3489/ 6200 | consumed samples: 3572736 | consumed tokens: 7316963328 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702112E+00 | loss scale: 2048.0 | grad norm: 4.871 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.264 | TFLOPs: 42.45 | +[default7]: iteration 3490/ 6200 | consumed samples: 3573760 | consumed tokens: 7319060480 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713216E+00 | loss scale: 2048.0 | grad norm: 5.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.295 | TFLOPs: 42.46 | +[default7]: iteration 3491/ 6200 | consumed samples: 3574784 | consumed tokens: 7321157632 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704569E+00 | loss scale: 2048.0 | grad norm: 7.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.274 | TFLOPs: 42.45 | +[default7]: iteration 3492/ 6200 | consumed samples: 3575808 | consumed tokens: 7323254784 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733226E+00 | loss scale: 2048.0 | grad norm: 5.649 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.380 | TFLOPs: 42.48 | +[default7]: iteration 3493/ 6200 | consumed samples: 3576832 | consumed tokens: 7325351936 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.720656E+00 | loss scale: 2048.0 | grad norm: 5.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.225 | TFLOPs: 42.44 | +[default7]: iteration 3494/ 6200 | consumed samples: 3577856 | consumed tokens: 7327449088 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718033E+00 | loss scale: 2048.0 | grad norm: 5.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.159 | TFLOPs: 42.42 | +[default7]: iteration 3495/ 6200 | consumed samples: 3578880 | consumed tokens: 7329546240 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748834E+00 | loss scale: 2048.0 | grad norm: 5.032 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.293 | TFLOPs: 42.46 | +[default7]: iteration 3496/ 6200 | consumed samples: 3579904 | consumed tokens: 7331643392 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718526E+00 | loss scale: 2048.0 | grad norm: 7.798 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.395 | TFLOPs: 42.49 | +[default7]: iteration 3497/ 6200 | consumed samples: 3580928 | consumed tokens: 7333740544 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724220E+00 | loss scale: 2048.0 | grad norm: 5.779 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.488 | TFLOPs: 42.52 | +[default7]: iteration 3498/ 6200 | consumed samples: 3581952 | consumed tokens: 7335837696 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.720779E+00 | loss scale: 2048.0 | grad norm: 5.762 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.051 | TFLOPs: 42.38 | +[default7]: iteration 3499/ 6200 | consumed samples: 3582976 | consumed tokens: 7337934848 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.751513E+00 | loss scale: 2048.0 | grad norm: 7.679 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.182 | TFLOPs: 42.42 | +[default7]: iteration 3500/ 6200 | consumed samples: 3584000 | consumed tokens: 7340032000 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.744795E+00 | loss scale: 2048.0 | grad norm: 6.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.615 | TFLOPs: 42.25 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 3500 | lm loss value: 3.558800E+00 | lm loss PPL: 3.512101E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 3500 | lm loss value: 1.620214E+00 | lm loss PPL: 5.054174E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 3500 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 20:15:27,491] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3500 is begin to save! +[default0]:[2022-10-06 20:15:27,498] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:27,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:27,901] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:27,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:27,934] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:27,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:27,964] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:27,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:27,993] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,080] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,109] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,169] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,228] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,286] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,314] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,400] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,430] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,457] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,485] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,572] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 20:15:28,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,600] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 20:15:28,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 20:15:28,602] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/mp_rank_00_model_states.pt +[default0]:[2022-10-06 20:15:28,602] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 20:15:28,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 20:15:28,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 20:15:28,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 20:15:28,811] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 20:15:28,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 20:15:28,800] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 20:15:28,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 20:15:28,830] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 20:15:28,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 20:15:28,834] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 20:15:28,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 20:15:28,812] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 20:15:28,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 20:15:28,919] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 20:15:28,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 20:15:28,905] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 20:15:28,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 20:15:28,904] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 20:15:28,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 20:15:28,922] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 20:15:28,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 20:15:28,898] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 20:15:28,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 20:15:28,866] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 20:15:28,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 20:15:28,888] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 20:15:28,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 20:15:28,923] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 20:15:28,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 20:15:28,945] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 20:15:28,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 20:15:28,902] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 20:15:28,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 20:15:28,878] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 20:15:28,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 20:15:28,934] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 20:15:28,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 20:15:28,891] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 20:15:28,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 20:15:28,904] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 20:15:28,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 20:15:28,938] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 20:15:28,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 20:15:28,928] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 20:15:28,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 20:15:28,997] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 20:15:28,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 20:15:28,903] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 20:15:28,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 20:15:28,995] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 20:15:28,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 20:15:28,939] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 20:15:28,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 20:15:28,959] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default4]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default1]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default7]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default1]:[2022-10-06 20:15:28,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 20:15:28,994] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default1]:[2022-10-06 20:15:28,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 20:15:28,935] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default4]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default3]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default4]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default5]:[2022-10-06 20:15:28,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 20:15:28,936] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default3]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default2]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default3]:[2022-10-06 20:15:28,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 20:15:28,974] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default7]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 20:15:29,017] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default7]:time (ms) | save-checkpoint: 1527.51 +[default7]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default0]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default7]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default2]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default2]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default6]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default6]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default4]:[2022-10-06 20:15:29,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 20:15:29,011] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3500/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default3]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default0]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default0]: successfully saved checkpoint at iteration 3500 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default5]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default5]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default6]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default0]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default1]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default5]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default6]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default0]:[2022-10-06 20:15:29,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3500 is ready now! +[default7]: iteration 3501/ 6200 | consumed samples: 3585024 | consumed tokens: 7342129152 | elapsed time per iteration (s): 53.03 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733994E+00 | loss scale: 2048.0 | grad norm: 5.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.308 | TFLOPs: 5.89 | +[default7]: iteration 3502/ 6200 | consumed samples: 3586048 | consumed tokens: 7344226304 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709012E+00 | loss scale: 2048.0 | grad norm: 6.522 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.862 | TFLOPs: 42.33 | +[default7]: iteration 3503/ 6200 | consumed samples: 3587072 | consumed tokens: 7346323456 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749607E+00 | loss scale: 2048.0 | grad norm: 6.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.828 | TFLOPs: 42.32 | +[default7]: iteration 3504/ 6200 | consumed samples: 3588096 | consumed tokens: 7348420608 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712521E+00 | loss scale: 2048.0 | grad norm: 5.973 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.495 | TFLOPs: 42.21 | +[default7]: iteration 3505/ 6200 | consumed samples: 3589120 | consumed tokens: 7350517760 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746823E+00 | loss scale: 2048.0 | grad norm: 5.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.840 | TFLOPs: 42.32 | +[default7]: iteration 3506/ 6200 | consumed samples: 3590144 | consumed tokens: 7352614912 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727048E+00 | loss scale: 2048.0 | grad norm: 5.594 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.565 | TFLOPs: 42.24 | +[default7]: iteration 3507/ 6200 | consumed samples: 3591168 | consumed tokens: 7354712064 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738829E+00 | loss scale: 2048.0 | grad norm: 10.980 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.808 | TFLOPs: 42.31 | +[default7]: iteration 3508/ 6200 | consumed samples: 3592192 | consumed tokens: 7356809216 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.744153E+00 | loss scale: 2048.0 | grad norm: 5.904 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.350 | TFLOPs: 42.17 | +[default7]: iteration 3509/ 6200 | consumed samples: 3593216 | consumed tokens: 7358906368 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.751504E+00 | loss scale: 2048.0 | grad norm: 5.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.678 | TFLOPs: 42.27 | +[default7]: iteration 3510/ 6200 | consumed samples: 3594240 | consumed tokens: 7361003520 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700715E+00 | loss scale: 2048.0 | grad norm: 6.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.819 | TFLOPs: 42.31 | +[default7]: iteration 3511/ 6200 | consumed samples: 3595264 | consumed tokens: 7363100672 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741822E+00 | loss scale: 2048.0 | grad norm: 6.071 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.931 | TFLOPs: 42.35 | +[default7]: iteration 3512/ 6200 | consumed samples: 3596288 | consumed tokens: 7365197824 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.734398E+00 | loss scale: 2048.0 | grad norm: 5.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.913 | TFLOPs: 42.34 | +[default7]: iteration 3513/ 6200 | consumed samples: 3597312 | consumed tokens: 7367294976 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.736318E+00 | loss scale: 2048.0 | grad norm: 6.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.080 | TFLOPs: 42.39 | +[default7]: iteration 3514/ 6200 | consumed samples: 3598336 | consumed tokens: 7369392128 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747972E+00 | loss scale: 2048.0 | grad norm: 5.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.868 | TFLOPs: 42.33 | +[default7]: iteration 3515/ 6200 | consumed samples: 3599360 | consumed tokens: 7371489280 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.772324E+00 | loss scale: 2048.0 | grad norm: 5.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.647 | TFLOPs: 42.26 | +[default7]: iteration 3516/ 6200 | consumed samples: 3600384 | consumed tokens: 7373586432 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704646E+00 | loss scale: 2048.0 | grad norm: 5.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.780 | TFLOPs: 42.30 | +[default7]: iteration 3517/ 6200 | consumed samples: 3601408 | consumed tokens: 7375683584 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743175E+00 | loss scale: 2048.0 | grad norm: 5.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.766 | TFLOPs: 42.30 | +[default7]: iteration 3518/ 6200 | consumed samples: 3602432 | consumed tokens: 7377780736 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709779E+00 | loss scale: 2048.0 | grad norm: 5.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.833 | TFLOPs: 42.32 | +[default7]: iteration 3519/ 6200 | consumed samples: 3603456 | consumed tokens: 7379877888 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.725194E+00 | loss scale: 2048.0 | grad norm: 5.866 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.940 | TFLOPs: 42.35 | +[default7]: iteration 3520/ 6200 | consumed samples: 3604480 | consumed tokens: 7381975040 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740307E+00 | loss scale: 2048.0 | grad norm: 5.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.723 | TFLOPs: 42.28 | +[default7]: iteration 3521/ 6200 | consumed samples: 3605504 | consumed tokens: 7384072192 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739879E+00 | loss scale: 2048.0 | grad norm: 4.762 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.732 | TFLOPs: 42.29 | +[default7]: iteration 3522/ 6200 | consumed samples: 3606528 | consumed tokens: 7386169344 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746819E+00 | loss scale: 2048.0 | grad norm: 5.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.316 | TFLOPs: 42.16 | +[default7]: iteration 3523/ 6200 | consumed samples: 3607552 | consumed tokens: 7388266496 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716792E+00 | loss scale: 2048.0 | grad norm: 5.715 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.653 | TFLOPs: 42.26 | +[default7]: iteration 3524/ 6200 | consumed samples: 3608576 | consumed tokens: 7390363648 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709680E+00 | loss scale: 2048.0 | grad norm: 5.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.043 | TFLOPs: 42.38 | +[default7]: iteration 3525/ 6200 | consumed samples: 3609600 | consumed tokens: 7392460800 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708418E+00 | loss scale: 2048.0 | grad norm: 6.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.643 | TFLOPs: 42.26 | +[default7]: iteration 3526/ 6200 | consumed samples: 3610624 | consumed tokens: 7394557952 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719797E+00 | loss scale: 2048.0 | grad norm: 5.672 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.917 | TFLOPs: 42.34 | +[default7]: iteration 3527/ 6200 | consumed samples: 3611648 | consumed tokens: 7396655104 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735420E+00 | loss scale: 2048.0 | grad norm: 4.923 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.932 | TFLOPs: 42.35 | +[default7]: iteration 3528/ 6200 | consumed samples: 3612672 | consumed tokens: 7398752256 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735139E+00 | loss scale: 2048.0 | grad norm: 5.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.546 | TFLOPs: 42.23 | +[default7]: iteration 3529/ 6200 | consumed samples: 3613696 | consumed tokens: 7400849408 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740283E+00 | loss scale: 2048.0 | grad norm: 5.718 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.757 | TFLOPs: 42.29 | +[default7]: iteration 3530/ 6200 | consumed samples: 3614720 | consumed tokens: 7402946560 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733229E+00 | loss scale: 2048.0 | grad norm: 5.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.654 | TFLOPs: 42.26 | +[default7]: iteration 3531/ 6200 | consumed samples: 3615744 | consumed tokens: 7405043712 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715860E+00 | loss scale: 2048.0 | grad norm: 6.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.786 | TFLOPs: 42.30 | +[default7]: iteration 3532/ 6200 | consumed samples: 3616768 | consumed tokens: 7407140864 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742327E+00 | loss scale: 2048.0 | grad norm: 5.604 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.788 | TFLOPs: 42.30 | +[default7]: iteration 3533/ 6200 | consumed samples: 3617792 | consumed tokens: 7409238016 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715074E+00 | loss scale: 2048.0 | grad norm: 5.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.570 | TFLOPs: 42.24 | +[default7]: iteration 3534/ 6200 | consumed samples: 3618816 | consumed tokens: 7411335168 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746307E+00 | loss scale: 2048.0 | grad norm: 5.013 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.454 | TFLOPs: 42.20 | +[default7]: iteration 3535/ 6200 | consumed samples: 3619840 | consumed tokens: 7413432320 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748363E+00 | loss scale: 2048.0 | grad norm: 6.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.542 | TFLOPs: 42.23 | +[default7]: iteration 3536/ 6200 | consumed samples: 3620864 | consumed tokens: 7415529472 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697111E+00 | loss scale: 2048.0 | grad norm: 4.791 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.679 | TFLOPs: 42.27 | +[default7]: iteration 3537/ 6200 | consumed samples: 3621888 | consumed tokens: 7417626624 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717208E+00 | loss scale: 2048.0 | grad norm: 5.653 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.784 | TFLOPs: 42.30 | +[default7]: iteration 3538/ 6200 | consumed samples: 3622912 | consumed tokens: 7419723776 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.736026E+00 | loss scale: 2048.0 | grad norm: 5.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.617 | TFLOPs: 42.25 | +[default7]: iteration 3539/ 6200 | consumed samples: 3623936 | consumed tokens: 7421820928 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.725942E+00 | loss scale: 2048.0 | grad norm: 6.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.686 | TFLOPs: 42.27 | +[default7]: iteration 3540/ 6200 | consumed samples: 3624960 | consumed tokens: 7423918080 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717314E+00 | loss scale: 2048.0 | grad norm: 5.976 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.620 | TFLOPs: 42.25 | +[default7]: iteration 3541/ 6200 | consumed samples: 3625984 | consumed tokens: 7426015232 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719729E+00 | loss scale: 2048.0 | grad norm: 5.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.382 | TFLOPs: 42.18 | +[default7]: iteration 3542/ 6200 | consumed samples: 3627008 | consumed tokens: 7428112384 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722763E+00 | loss scale: 2048.0 | grad norm: 5.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.688 | TFLOPs: 42.27 | +[default7]: iteration 3543/ 6200 | consumed samples: 3628032 | consumed tokens: 7430209536 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.720465E+00 | loss scale: 2048.0 | grad norm: 5.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.379 | TFLOPs: 42.18 | +[default7]: iteration 3544/ 6200 | consumed samples: 3629056 | consumed tokens: 7432306688 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738193E+00 | loss scale: 2048.0 | grad norm: 6.881 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.756 | TFLOPs: 42.29 | +[default7]: iteration 3545/ 6200 | consumed samples: 3630080 | consumed tokens: 7434403840 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733186E+00 | loss scale: 2048.0 | grad norm: 5.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.858 | TFLOPs: 42.33 | +[default7]: iteration 3546/ 6200 | consumed samples: 3631104 | consumed tokens: 7436500992 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697346E+00 | loss scale: 2048.0 | grad norm: 5.673 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.902 | TFLOPs: 42.34 | +[default7]: iteration 3547/ 6200 | consumed samples: 3632128 | consumed tokens: 7438598144 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711894E+00 | loss scale: 2048.0 | grad norm: 6.761 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.780 | TFLOPs: 42.30 | +[default7]: iteration 3548/ 6200 | consumed samples: 3633152 | consumed tokens: 7440695296 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.732371E+00 | loss scale: 2048.0 | grad norm: 5.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.986 | TFLOPs: 42.06 | +[default7]: iteration 3549/ 6200 | consumed samples: 3634176 | consumed tokens: 7442792448 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703265E+00 | loss scale: 2048.0 | grad norm: 5.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.736 | TFLOPs: 42.29 | +[default7]: iteration 3550/ 6200 | consumed samples: 3635200 | consumed tokens: 7444889600 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707756E+00 | loss scale: 2048.0 | grad norm: 4.709 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.022 | TFLOPs: 42.38 | +[default7]: iteration 3551/ 6200 | consumed samples: 3636224 | consumed tokens: 7446986752 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727936E+00 | loss scale: 2048.0 | grad norm: 5.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.758 | TFLOPs: 42.29 | +[default7]: iteration 3552/ 6200 | consumed samples: 3637248 | consumed tokens: 7449083904 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719818E+00 | loss scale: 2048.0 | grad norm: 5.047 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.869 | TFLOPs: 42.33 | +[default7]: iteration 3553/ 6200 | consumed samples: 3638272 | consumed tokens: 7451181056 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.728688E+00 | loss scale: 2048.0 | grad norm: 5.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.001 | TFLOPs: 42.37 | +[default7]: iteration 3554/ 6200 | consumed samples: 3639296 | consumed tokens: 7453278208 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704892E+00 | loss scale: 2048.0 | grad norm: 5.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.006 | TFLOPs: 42.37 | +[default7]: iteration 3555/ 6200 | consumed samples: 3640320 | consumed tokens: 7455375360 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.728771E+00 | loss scale: 2048.0 | grad norm: 5.711 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.973 | TFLOPs: 42.36 | +[default7]: iteration 3556/ 6200 | consumed samples: 3641344 | consumed tokens: 7457472512 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.725928E+00 | loss scale: 2048.0 | grad norm: 5.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.005 | TFLOPs: 42.37 | +[default7]: iteration 3557/ 6200 | consumed samples: 3642368 | consumed tokens: 7459569664 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740039E+00 | loss scale: 2048.0 | grad norm: 5.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.850 | TFLOPs: 42.32 | +[default7]: iteration 3558/ 6200 | consumed samples: 3643392 | consumed tokens: 7461666816 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696628E+00 | loss scale: 2048.0 | grad norm: 5.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.730 | TFLOPs: 42.29 | +[default7]: iteration 3559/ 6200 | consumed samples: 3644416 | consumed tokens: 7463763968 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695248E+00 | loss scale: 2048.0 | grad norm: 6.589 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.691 | TFLOPs: 42.27 | +[default7]: iteration 3560/ 6200 | consumed samples: 3645440 | consumed tokens: 7465861120 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703436E+00 | loss scale: 2048.0 | grad norm: 6.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.895 | TFLOPs: 42.34 | +[default7]: iteration 3561/ 6200 | consumed samples: 3646464 | consumed tokens: 7467958272 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696604E+00 | loss scale: 2048.0 | grad norm: 6.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.704 | TFLOPs: 42.28 | +[default7]: iteration 3562/ 6200 | consumed samples: 3647488 | consumed tokens: 7470055424 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.732566E+00 | loss scale: 2048.0 | grad norm: 6.644 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.554 | TFLOPs: 42.23 | +[default7]: iteration 3563/ 6200 | consumed samples: 3648512 | consumed tokens: 7472152576 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.744917E+00 | loss scale: 2048.0 | grad norm: 6.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.901 | TFLOPs: 42.34 | +[default7]: iteration 3564/ 6200 | consumed samples: 3649536 | consumed tokens: 7474249728 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733855E+00 | loss scale: 2048.0 | grad norm: 5.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.665 | TFLOPs: 42.27 | +[default7]: iteration 3565/ 6200 | consumed samples: 3650560 | consumed tokens: 7476346880 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710483E+00 | loss scale: 2048.0 | grad norm: 6.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.792 | TFLOPs: 42.31 | +[default7]: iteration 3566/ 6200 | consumed samples: 3651584 | consumed tokens: 7478444032 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742187E+00 | loss scale: 2048.0 | grad norm: 7.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.920 | TFLOPs: 42.34 | +[default7]: iteration 3567/ 6200 | consumed samples: 3652608 | consumed tokens: 7480541184 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715920E+00 | loss scale: 2048.0 | grad norm: 6.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.601 | TFLOPs: 42.25 | +[default7]: iteration 3568/ 6200 | consumed samples: 3653632 | consumed tokens: 7482638336 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.739814E+00 | loss scale: 2048.0 | grad norm: 6.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.837 | TFLOPs: 42.32 | +[default7]: iteration 3569/ 6200 | consumed samples: 3654656 | consumed tokens: 7484735488 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731542E+00 | loss scale: 2048.0 | grad norm: 5.776 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.769 | TFLOPs: 42.30 | +[default7]: iteration 3570/ 6200 | consumed samples: 3655680 | consumed tokens: 7486832640 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727410E+00 | loss scale: 2048.0 | grad norm: 6.927 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.593 | TFLOPs: 42.24 | +[default7]: iteration 3571/ 6200 | consumed samples: 3656704 | consumed tokens: 7488929792 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.725495E+00 | loss scale: 2048.0 | grad norm: 6.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.832 | TFLOPs: 42.32 | +[default7]: iteration 3572/ 6200 | consumed samples: 3657728 | consumed tokens: 7491026944 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.736615E+00 | loss scale: 2048.0 | grad norm: 5.744 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.991 | TFLOPs: 42.37 | +[default7]: iteration 3573/ 6200 | consumed samples: 3658752 | consumed tokens: 7493124096 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746652E+00 | loss scale: 2048.0 | grad norm: 6.702 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.910 | TFLOPs: 42.34 | +[default7]: iteration 3574/ 6200 | consumed samples: 3659776 | consumed tokens: 7495221248 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707224E+00 | loss scale: 2048.0 | grad norm: 6.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.003 | TFLOPs: 42.37 | +[default7]: iteration 3575/ 6200 | consumed samples: 3660800 | consumed tokens: 7497318400 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.721903E+00 | loss scale: 2048.0 | grad norm: 5.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.049 | TFLOPs: 42.38 | +[default7]: iteration 3576/ 6200 | consumed samples: 3661824 | consumed tokens: 7499415552 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724332E+00 | loss scale: 2048.0 | grad norm: 6.023 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.624 | TFLOPs: 42.25 | +[default7]: iteration 3577/ 6200 | consumed samples: 3662848 | consumed tokens: 7501512704 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711700E+00 | loss scale: 2048.0 | grad norm: 4.915 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.561 | TFLOPs: 42.23 | +[default7]: iteration 3578/ 6200 | consumed samples: 3663872 | consumed tokens: 7503609856 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733886E+00 | loss scale: 2048.0 | grad norm: 6.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.709 | TFLOPs: 42.28 | +[default7]: iteration 3579/ 6200 | consumed samples: 3664896 | consumed tokens: 7505707008 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716926E+00 | loss scale: 2048.0 | grad norm: 6.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.869 | TFLOPs: 42.33 | +[default7]: iteration 3580/ 6200 | consumed samples: 3665920 | consumed tokens: 7507804160 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713088E+00 | loss scale: 2048.0 | grad norm: 5.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.814 | TFLOPs: 42.31 | +[default7]: iteration 3581/ 6200 | consumed samples: 3666944 | consumed tokens: 7509901312 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.750669E+00 | loss scale: 2048.0 | grad norm: 6.003 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.841 | TFLOPs: 42.32 | +[default7]: iteration 3582/ 6200 | consumed samples: 3667968 | consumed tokens: 7511998464 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706310E+00 | loss scale: 2048.0 | grad norm: 6.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.919 | TFLOPs: 42.34 | +[default7]: iteration 3583/ 6200 | consumed samples: 3668992 | consumed tokens: 7514095616 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.725674E+00 | loss scale: 2048.0 | grad norm: 6.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.743 | TFLOPs: 42.29 | +[default7]: iteration 3584/ 6200 | consumed samples: 3670016 | consumed tokens: 7516192768 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715839E+00 | loss scale: 2048.0 | grad norm: 5.649 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.333 | TFLOPs: 42.17 | +[default7]: iteration 3585/ 6200 | consumed samples: 3671040 | consumed tokens: 7518289920 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702595E+00 | loss scale: 2048.0 | grad norm: 5.873 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.907 | TFLOPs: 42.34 | +[default7]: iteration 3586/ 6200 | consumed samples: 3672064 | consumed tokens: 7520387072 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724641E+00 | loss scale: 2048.0 | grad norm: 5.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.757 | TFLOPs: 42.29 | +[default7]: iteration 3587/ 6200 | consumed samples: 3673088 | consumed tokens: 7522484224 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685627E+00 | loss scale: 2048.0 | grad norm: 4.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.938 | TFLOPs: 42.35 | +[default7]: iteration 3588/ 6200 | consumed samples: 3674112 | consumed tokens: 7524581376 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746423E+00 | loss scale: 2048.0 | grad norm: 5.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.791 | TFLOPs: 42.30 | +[default7]: iteration 3589/ 6200 | consumed samples: 3675136 | consumed tokens: 7526678528 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.745296E+00 | loss scale: 2048.0 | grad norm: 5.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.949 | TFLOPs: 42.35 | +[default7]: iteration 3590/ 6200 | consumed samples: 3676160 | consumed tokens: 7528775680 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710824E+00 | loss scale: 2048.0 | grad norm: 5.774 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.943 | TFLOPs: 42.35 | +[default7]: iteration 3591/ 6200 | consumed samples: 3677184 | consumed tokens: 7530872832 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719590E+00 | loss scale: 2048.0 | grad norm: 5.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.054 | TFLOPs: 42.38 | +[default7]: iteration 3592/ 6200 | consumed samples: 3678208 | consumed tokens: 7532969984 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742684E+00 | loss scale: 2048.0 | grad norm: 6.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.694 | TFLOPs: 42.28 | +[default7]: iteration 3593/ 6200 | consumed samples: 3679232 | consumed tokens: 7535067136 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698649E+00 | loss scale: 2048.0 | grad norm: 5.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.796 | TFLOPs: 42.31 | +[default7]: iteration 3594/ 6200 | consumed samples: 3680256 | consumed tokens: 7537164288 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.734078E+00 | loss scale: 2048.0 | grad norm: 4.670 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.785 | TFLOPs: 42.30 | +[default7]: iteration 3595/ 6200 | consumed samples: 3681280 | consumed tokens: 7539261440 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716648E+00 | loss scale: 2048.0 | grad norm: 5.733 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.990 | TFLOPs: 42.37 | +[default7]: iteration 3596/ 6200 | consumed samples: 3682304 | consumed tokens: 7541358592 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722910E+00 | loss scale: 2048.0 | grad norm: 5.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.991 | TFLOPs: 42.37 | +[default7]: iteration 3597/ 6200 | consumed samples: 3683328 | consumed tokens: 7543455744 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724870E+00 | loss scale: 2048.0 | grad norm: 6.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.882 | TFLOPs: 42.33 | +[default7]: iteration 3598/ 6200 | consumed samples: 3684352 | consumed tokens: 7545552896 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.723513E+00 | loss scale: 2048.0 | grad norm: 4.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.762 | TFLOPs: 42.30 | +[default7]: iteration 3599/ 6200 | consumed samples: 3685376 | consumed tokens: 7547650048 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726151E+00 | loss scale: 2048.0 | grad norm: 5.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.664 | TFLOPs: 42.27 | +[default7]: iteration 3600/ 6200 | consumed samples: 3686400 | consumed tokens: 7549747200 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.748613E+00 | loss scale: 2048.0 | grad norm: 6.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.305 | TFLOPs: 42.16 | +[default7]: iteration 3601/ 6200 | consumed samples: 3687424 | consumed tokens: 7551844352 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712463E+00 | loss scale: 2048.0 | grad norm: 4.872 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.312 | TFLOPs: 42.16 | +[default7]: iteration 3602/ 6200 | consumed samples: 3688448 | consumed tokens: 7553941504 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735553E+00 | loss scale: 2048.0 | grad norm: 6.634 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.403 | TFLOPs: 42.19 | +[default7]: iteration 3603/ 6200 | consumed samples: 3689472 | consumed tokens: 7556038656 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741271E+00 | loss scale: 2048.0 | grad norm: 5.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.530 | TFLOPs: 42.23 | +[default7]: iteration 3604/ 6200 | consumed samples: 3690496 | consumed tokens: 7558135808 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709571E+00 | loss scale: 2048.0 | grad norm: 5.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.361 | TFLOPs: 42.17 | +[default7]: iteration 3605/ 6200 | consumed samples: 3691520 | consumed tokens: 7560232960 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704328E+00 | loss scale: 2048.0 | grad norm: 5.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.673 | TFLOPs: 42.27 | +[default7]: iteration 3606/ 6200 | consumed samples: 3692544 | consumed tokens: 7562330112 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716890E+00 | loss scale: 2048.0 | grad norm: 7.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.463 | TFLOPs: 42.20 | +[default7]: iteration 3607/ 6200 | consumed samples: 3693568 | consumed tokens: 7564427264 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716898E+00 | loss scale: 2048.0 | grad norm: 5.007 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.500 | TFLOPs: 42.22 | +[default7]: iteration 3608/ 6200 | consumed samples: 3694592 | consumed tokens: 7566524416 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724788E+00 | loss scale: 2048.0 | grad norm: 6.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.416 | TFLOPs: 42.19 | +[default7]: iteration 3609/ 6200 | consumed samples: 3695616 | consumed tokens: 7568621568 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.751635E+00 | loss scale: 2048.0 | grad norm: 5.644 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.404 | TFLOPs: 42.19 | +[default7]: iteration 3610/ 6200 | consumed samples: 3696640 | consumed tokens: 7570718720 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699196E+00 | loss scale: 2048.0 | grad norm: 5.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.422 | TFLOPs: 42.19 | +[default7]: iteration 3611/ 6200 | consumed samples: 3697664 | consumed tokens: 7572815872 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704464E+00 | loss scale: 2048.0 | grad norm: 6.058 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.370 | TFLOPs: 42.18 | +[default7]: iteration 3612/ 6200 | consumed samples: 3698688 | consumed tokens: 7574913024 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714303E+00 | loss scale: 2048.0 | grad norm: 7.712 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.741 | TFLOPs: 42.29 | +[default7]: iteration 3613/ 6200 | consumed samples: 3699712 | consumed tokens: 7577010176 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709833E+00 | loss scale: 2048.0 | grad norm: 6.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.868 | TFLOPs: 42.33 | +[default7]: iteration 3614/ 6200 | consumed samples: 3700736 | consumed tokens: 7579107328 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738148E+00 | loss scale: 2048.0 | grad norm: 5.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.724 | TFLOPs: 42.28 | +[default7]: iteration 3615/ 6200 | consumed samples: 3701760 | consumed tokens: 7581204480 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731792E+00 | loss scale: 2048.0 | grad norm: 5.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.564 | TFLOPs: 42.24 | +[default7]: iteration 3616/ 6200 | consumed samples: 3702784 | consumed tokens: 7583301632 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.730344E+00 | loss scale: 2048.0 | grad norm: 6.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.190 | TFLOPs: 42.12 | +[default7]: iteration 3617/ 6200 | consumed samples: 3703808 | consumed tokens: 7585398784 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711953E+00 | loss scale: 2048.0 | grad norm: 6.024 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.244 | TFLOPs: 42.14 | +[default7]: iteration 3618/ 6200 | consumed samples: 3704832 | consumed tokens: 7587495936 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719721E+00 | loss scale: 2048.0 | grad norm: 5.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.684 | TFLOPs: 42.27 | +[default7]: iteration 3619/ 6200 | consumed samples: 3705856 | consumed tokens: 7589593088 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724543E+00 | loss scale: 2048.0 | grad norm: 6.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.844 | TFLOPs: 42.32 | +[default7]: iteration 3620/ 6200 | consumed samples: 3706880 | consumed tokens: 7591690240 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740257E+00 | loss scale: 2048.0 | grad norm: 6.029 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.596 | TFLOPs: 42.25 | +[default7]: iteration 3621/ 6200 | consumed samples: 3707904 | consumed tokens: 7593787392 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724884E+00 | loss scale: 2048.0 | grad norm: 5.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.621 | TFLOPs: 42.25 | +[default7]: iteration 3622/ 6200 | consumed samples: 3708928 | consumed tokens: 7595884544 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.749818E+00 | loss scale: 2048.0 | grad norm: 5.754 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.435 | TFLOPs: 42.20 | +[default7]: iteration 3623/ 6200 | consumed samples: 3709952 | consumed tokens: 7597981696 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718643E+00 | loss scale: 2048.0 | grad norm: 7.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.681 | TFLOPs: 42.27 | +[default7]: iteration 3624/ 6200 | consumed samples: 3710976 | consumed tokens: 7600078848 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709104E+00 | loss scale: 2048.0 | grad norm: 5.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.619 | TFLOPs: 42.25 | +[default7]: iteration 3625/ 6200 | consumed samples: 3712000 | consumed tokens: 7602176000 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742435E+00 | loss scale: 2048.0 | grad norm: 4.929 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.681 | TFLOPs: 42.27 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 3625 | lm loss value: 3.581334E+00 | lm loss PPL: 3.592144E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 3625 | lm loss value: 1.617525E+00 | lm loss PPL: 5.040602E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 3626/ 6200 | consumed samples: 3713024 | consumed tokens: 7604273152 | elapsed time per iteration (s): 51.69 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707163E+00 | loss scale: 2048.0 | grad norm: 5.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.810 | TFLOPs: 6.04 | +[default7]: iteration 3627/ 6200 | consumed samples: 3714048 | consumed tokens: 7606370304 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695563E+00 | loss scale: 2048.0 | grad norm: 5.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.692 | TFLOPs: 42.27 | +[default7]: iteration 3628/ 6200 | consumed samples: 3715072 | consumed tokens: 7608467456 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735910E+00 | loss scale: 2048.0 | grad norm: 5.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.394 | TFLOPs: 42.18 | +[default7]: iteration 3629/ 6200 | consumed samples: 3716096 | consumed tokens: 7610564608 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719877E+00 | loss scale: 2048.0 | grad norm: 5.626 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.687 | TFLOPs: 42.27 | +[default7]: iteration 3630/ 6200 | consumed samples: 3717120 | consumed tokens: 7612661760 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717680E+00 | loss scale: 2048.0 | grad norm: 5.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.576 | TFLOPs: 41.93 | +[default7]: iteration 3631/ 6200 | consumed samples: 3718144 | consumed tokens: 7614758912 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.723496E+00 | loss scale: 2048.0 | grad norm: 6.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.779 | TFLOPs: 42.30 | +[default7]: iteration 3632/ 6200 | consumed samples: 3719168 | consumed tokens: 7616856064 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733443E+00 | loss scale: 2048.0 | grad norm: 6.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.716 | TFLOPs: 42.28 | +[default7]: iteration 3633/ 6200 | consumed samples: 3720192 | consumed tokens: 7618953216 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727047E+00 | loss scale: 2048.0 | grad norm: 5.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.401 | TFLOPs: 42.19 | +[default7]: iteration 3634/ 6200 | consumed samples: 3721216 | consumed tokens: 7621050368 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743954E+00 | loss scale: 2048.0 | grad norm: 4.838 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.519 | TFLOPs: 42.22 | +[default7]: iteration 3635/ 6200 | consumed samples: 3722240 | consumed tokens: 7623147520 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699162E+00 | loss scale: 2048.0 | grad norm: 7.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.609 | TFLOPs: 42.25 | +[default7]: iteration 3636/ 6200 | consumed samples: 3723264 | consumed tokens: 7625244672 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.736794E+00 | loss scale: 2048.0 | grad norm: 9.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.604 | TFLOPs: 42.25 | +[default7]: iteration 3637/ 6200 | consumed samples: 3724288 | consumed tokens: 7627341824 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.721243E+00 | loss scale: 2048.0 | grad norm: 5.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.639 | TFLOPs: 42.26 | +[default7]: iteration 3638/ 6200 | consumed samples: 3725312 | consumed tokens: 7629438976 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704342E+00 | loss scale: 2048.0 | grad norm: 4.873 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.639 | TFLOPs: 42.26 | +[default7]: iteration 3639/ 6200 | consumed samples: 3726336 | consumed tokens: 7631536128 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735514E+00 | loss scale: 2048.0 | grad norm: 6.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.644 | TFLOPs: 42.26 | +[default7]: iteration 3640/ 6200 | consumed samples: 3727360 | consumed tokens: 7633633280 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733195E+00 | loss scale: 2048.0 | grad norm: 7.013 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.719 | TFLOPs: 42.28 | +[default7]: iteration 3641/ 6200 | consumed samples: 3728384 | consumed tokens: 7635730432 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735521E+00 | loss scale: 2048.0 | grad norm: 4.910 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.672 | TFLOPs: 42.27 | +[default7]: iteration 3642/ 6200 | consumed samples: 3729408 | consumed tokens: 7637827584 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724676E+00 | loss scale: 2048.0 | grad norm: 5.062 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.062 | TFLOPs: 42.08 | +[default7]: iteration 3643/ 6200 | consumed samples: 3730432 | consumed tokens: 7639924736 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704349E+00 | loss scale: 2048.0 | grad norm: 5.737 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.770 | TFLOPs: 42.30 | +[default7]: iteration 3644/ 6200 | consumed samples: 3731456 | consumed tokens: 7642021888 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719499E+00 | loss scale: 2048.0 | grad norm: 5.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.516 | TFLOPs: 42.22 | +[default7]: iteration 3645/ 6200 | consumed samples: 3732480 | consumed tokens: 7644119040 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711627E+00 | loss scale: 2048.0 | grad norm: 6.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.390 | TFLOPs: 42.18 | +[default7]: iteration 3646/ 6200 | consumed samples: 3733504 | consumed tokens: 7646216192 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726757E+00 | loss scale: 2048.0 | grad norm: 5.743 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.663 | TFLOPs: 42.27 | +[default7]: iteration 3647/ 6200 | consumed samples: 3734528 | consumed tokens: 7648313344 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.728138E+00 | loss scale: 2048.0 | grad norm: 5.839 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.644 | TFLOPs: 42.26 | +[default7]: iteration 3648/ 6200 | consumed samples: 3735552 | consumed tokens: 7650410496 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.701147E+00 | loss scale: 2048.0 | grad norm: 5.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.671 | TFLOPs: 42.27 | +[default7]: iteration 3649/ 6200 | consumed samples: 3736576 | consumed tokens: 7652507648 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722399E+00 | loss scale: 2048.0 | grad norm: 5.062 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.663 | TFLOPs: 42.27 | +[default7]: iteration 3650/ 6200 | consumed samples: 3737600 | consumed tokens: 7654604800 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.752398E+00 | loss scale: 2048.0 | grad norm: 6.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.617 | TFLOPs: 42.25 | +[default7]: iteration 3651/ 6200 | consumed samples: 3738624 | consumed tokens: 7656701952 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.730940E+00 | loss scale: 2048.0 | grad norm: 5.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.731 | TFLOPs: 42.29 | +[default7]: iteration 3652/ 6200 | consumed samples: 3739648 | consumed tokens: 7658799104 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746573E+00 | loss scale: 2048.0 | grad norm: 5.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.648 | TFLOPs: 42.26 | +[default7]: iteration 3653/ 6200 | consumed samples: 3740672 | consumed tokens: 7660896256 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718368E+00 | loss scale: 2048.0 | grad norm: 5.955 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.729 | TFLOPs: 42.29 | +[default7]: iteration 3654/ 6200 | consumed samples: 3741696 | consumed tokens: 7662993408 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686188E+00 | loss scale: 2048.0 | grad norm: 5.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.556 | TFLOPs: 42.23 | +[default7]: iteration 3655/ 6200 | consumed samples: 3742720 | consumed tokens: 7665090560 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679714E+00 | loss scale: 2048.0 | grad norm: 5.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.526 | TFLOPs: 42.22 | +[default7]: iteration 3656/ 6200 | consumed samples: 3743744 | consumed tokens: 7667187712 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683301E+00 | loss scale: 2048.0 | grad norm: 5.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.757 | TFLOPs: 42.29 | +[default7]: iteration 3657/ 6200 | consumed samples: 3744768 | consumed tokens: 7669284864 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697065E+00 | loss scale: 2048.0 | grad norm: 4.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.788 | TFLOPs: 42.30 | +[default7]: iteration 3658/ 6200 | consumed samples: 3745792 | consumed tokens: 7671382016 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722903E+00 | loss scale: 2048.0 | grad norm: 5.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.103 | TFLOPs: 42.40 | +[default7]: iteration 3659/ 6200 | consumed samples: 3746816 | consumed tokens: 7673479168 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733660E+00 | loss scale: 2048.0 | grad norm: 4.910 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.154 | TFLOPs: 42.42 | +[default7]: iteration 3660/ 6200 | consumed samples: 3747840 | consumed tokens: 7675576320 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707147E+00 | loss scale: 2048.0 | grad norm: 5.978 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.017 | TFLOPs: 42.37 | +[default7]: iteration 3661/ 6200 | consumed samples: 3748864 | consumed tokens: 7677673472 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708506E+00 | loss scale: 2048.0 | grad norm: 5.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.002 | TFLOPs: 42.37 | +[default7]: iteration 3662/ 6200 | consumed samples: 3749888 | consumed tokens: 7679770624 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707260E+00 | loss scale: 2048.0 | grad norm: 6.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.124 | TFLOPs: 42.41 | +[default7]: iteration 3663/ 6200 | consumed samples: 3750912 | consumed tokens: 7681867776 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711197E+00 | loss scale: 2048.0 | grad norm: 6.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.156 | TFLOPs: 42.42 | +[default7]: iteration 3664/ 6200 | consumed samples: 3751936 | consumed tokens: 7683964928 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717115E+00 | loss scale: 2048.0 | grad norm: 4.910 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.126 | TFLOPs: 42.41 | +[default7]: iteration 3665/ 6200 | consumed samples: 3752960 | consumed tokens: 7686062080 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726517E+00 | loss scale: 2048.0 | grad norm: 5.672 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.938 | TFLOPs: 42.35 | +[default7]: iteration 3666/ 6200 | consumed samples: 3753984 | consumed tokens: 7688159232 | elapsed time per iteration (s): 7.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718806E+00 | loss scale: 2048.0 | grad norm: 4.688 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.763 | TFLOPs: 41.69 | +[default7]: iteration 3667/ 6200 | consumed samples: 3755008 | consumed tokens: 7690256384 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698413E+00 | loss scale: 2048.0 | grad norm: 5.048 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.819 | TFLOPs: 42.31 | +[default7]: iteration 3668/ 6200 | consumed samples: 3756032 | consumed tokens: 7692353536 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740512E+00 | loss scale: 2048.0 | grad norm: 5.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.111 | TFLOPs: 42.40 | +[default7]: iteration 3669/ 6200 | consumed samples: 3757056 | consumed tokens: 7694450688 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.720896E+00 | loss scale: 2048.0 | grad norm: 6.042 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.254 | TFLOPs: 42.45 | +[default7]: iteration 3670/ 6200 | consumed samples: 3758080 | consumed tokens: 7696547840 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727607E+00 | loss scale: 2048.0 | grad norm: 6.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.357 | TFLOPs: 42.48 | +[default7]: iteration 3671/ 6200 | consumed samples: 3759104 | consumed tokens: 7698644992 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738216E+00 | loss scale: 2048.0 | grad norm: 4.770 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.028 | TFLOPs: 42.38 | +[default7]: iteration 3672/ 6200 | consumed samples: 3760128 | consumed tokens: 7700742144 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710285E+00 | loss scale: 2048.0 | grad norm: 5.646 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.205 | TFLOPs: 42.43 | +[default7]: iteration 3673/ 6200 | consumed samples: 3761152 | consumed tokens: 7702839296 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722113E+00 | loss scale: 2048.0 | grad norm: 6.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.321 | TFLOPs: 42.47 | +[default7]: iteration 3674/ 6200 | consumed samples: 3762176 | consumed tokens: 7704936448 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693070E+00 | loss scale: 2048.0 | grad norm: 5.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.159 | TFLOPs: 42.42 | +[default7]: iteration 3675/ 6200 | consumed samples: 3763200 | consumed tokens: 7707033600 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698992E+00 | loss scale: 2048.0 | grad norm: 6.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.124 | TFLOPs: 42.41 | +[default7]: iteration 3676/ 6200 | consumed samples: 3764224 | consumed tokens: 7709130752 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705610E+00 | loss scale: 2048.0 | grad norm: 6.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.203 | TFLOPs: 42.43 | +[default7]: iteration 3677/ 6200 | consumed samples: 3765248 | consumed tokens: 7711227904 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.723456E+00 | loss scale: 2048.0 | grad norm: 5.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.262 | TFLOPs: 42.45 | +[default7]: iteration 3678/ 6200 | consumed samples: 3766272 | consumed tokens: 7713325056 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.725524E+00 | loss scale: 2048.0 | grad norm: 5.857 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.591 | TFLOPs: 41.63 | +[default7]: iteration 3679/ 6200 | consumed samples: 3767296 | consumed tokens: 7715422208 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710857E+00 | loss scale: 2048.0 | grad norm: 4.603 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.252 | TFLOPs: 42.45 | +[default7]: iteration 3680/ 6200 | consumed samples: 3768320 | consumed tokens: 7717519360 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698586E+00 | loss scale: 2048.0 | grad norm: 7.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.196 | TFLOPs: 42.43 | +[default7]: iteration 3681/ 6200 | consumed samples: 3769344 | consumed tokens: 7719616512 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746919E+00 | loss scale: 2048.0 | grad norm: 6.856 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.291 | TFLOPs: 42.46 | +[default7]: iteration 3682/ 6200 | consumed samples: 3770368 | consumed tokens: 7721713664 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704703E+00 | loss scale: 2048.0 | grad norm: 5.958 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.329 | TFLOPs: 42.47 | +[default7]: iteration 3683/ 6200 | consumed samples: 3771392 | consumed tokens: 7723810816 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727044E+00 | loss scale: 2048.0 | grad norm: 6.058 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.346 | TFLOPs: 42.47 | +[default7]: iteration 3684/ 6200 | consumed samples: 3772416 | consumed tokens: 7725907968 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698839E+00 | loss scale: 2048.0 | grad norm: 9.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.073 | TFLOPs: 42.39 | +[default7]: iteration 3685/ 6200 | consumed samples: 3773440 | consumed tokens: 7728005120 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711619E+00 | loss scale: 2048.0 | grad norm: 6.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.229 | TFLOPs: 42.44 | +[default7]: iteration 3686/ 6200 | consumed samples: 3774464 | consumed tokens: 7730102272 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718674E+00 | loss scale: 2048.0 | grad norm: 5.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.120 | TFLOPs: 42.40 | +[default7]: iteration 3687/ 6200 | consumed samples: 3775488 | consumed tokens: 7732199424 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.756345E+00 | loss scale: 2048.0 | grad norm: 5.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.298 | TFLOPs: 42.46 | +[default7]: iteration 3688/ 6200 | consumed samples: 3776512 | consumed tokens: 7734296576 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.721227E+00 | loss scale: 2048.0 | grad norm: 5.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.294 | TFLOPs: 42.46 | +[default7]: iteration 3689/ 6200 | consumed samples: 3777536 | consumed tokens: 7736393728 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.747529E+00 | loss scale: 2048.0 | grad norm: 5.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.187 | TFLOPs: 42.43 | +[default7]: iteration 3690/ 6200 | consumed samples: 3778560 | consumed tokens: 7738490880 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679153E+00 | loss scale: 2048.0 | grad norm: 5.822 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.069 | TFLOPs: 42.39 | +[default7]: iteration 3691/ 6200 | consumed samples: 3779584 | consumed tokens: 7740588032 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700345E+00 | loss scale: 2048.0 | grad norm: 6.007 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.252 | TFLOPs: 42.45 | +[default7]: iteration 3692/ 6200 | consumed samples: 3780608 | consumed tokens: 7742685184 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.743711E+00 | loss scale: 2048.0 | grad norm: 6.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.162 | TFLOPs: 42.42 | +[default7]: iteration 3693/ 6200 | consumed samples: 3781632 | consumed tokens: 7744782336 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.752144E+00 | loss scale: 2048.0 | grad norm: 5.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.374 | TFLOPs: 42.48 | +[default7]: iteration 3694/ 6200 | consumed samples: 3782656 | consumed tokens: 7746879488 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715704E+00 | loss scale: 2048.0 | grad norm: 6.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.194 | TFLOPs: 42.43 | +[default7]: iteration 3695/ 6200 | consumed samples: 3783680 | consumed tokens: 7748976640 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.734110E+00 | loss scale: 2048.0 | grad norm: 5.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.180 | TFLOPs: 42.42 | +[default7]: iteration 3696/ 6200 | consumed samples: 3784704 | consumed tokens: 7751073792 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687048E+00 | loss scale: 2048.0 | grad norm: 6.888 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.163 | TFLOPs: 42.42 | +[default7]: iteration 3697/ 6200 | consumed samples: 3785728 | consumed tokens: 7753170944 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724359E+00 | loss scale: 2048.0 | grad norm: 5.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.144 | TFLOPs: 42.41 | +[default7]: iteration 3698/ 6200 | consumed samples: 3786752 | consumed tokens: 7755268096 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.725840E+00 | loss scale: 2048.0 | grad norm: 5.852 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.283 | TFLOPs: 42.45 | +[default7]: iteration 3699/ 6200 | consumed samples: 3787776 | consumed tokens: 7757365248 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731899E+00 | loss scale: 2048.0 | grad norm: 6.043 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.421 | TFLOPs: 42.50 | +[default7]: iteration 3700/ 6200 | consumed samples: 3788800 | consumed tokens: 7759462400 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719593E+00 | loss scale: 2048.0 | grad norm: 5.065 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.216 | TFLOPs: 42.43 | +[default7]: iteration 3701/ 6200 | consumed samples: 3789824 | consumed tokens: 7761559552 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718702E+00 | loss scale: 2048.0 | grad norm: 6.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.237 | TFLOPs: 42.44 | +[default7]: iteration 3702/ 6200 | consumed samples: 3790848 | consumed tokens: 7763656704 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712086E+00 | loss scale: 2048.0 | grad norm: 4.972 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.001 | TFLOPs: 42.37 | +[default7]: iteration 3703/ 6200 | consumed samples: 3791872 | consumed tokens: 7765753856 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690294E+00 | loss scale: 2048.0 | grad norm: 5.756 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.325 | TFLOPs: 42.47 | +[default7]: iteration 3704/ 6200 | consumed samples: 3792896 | consumed tokens: 7767851008 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683338E+00 | loss scale: 2048.0 | grad norm: 5.618 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.316 | TFLOPs: 42.46 | +[default7]: iteration 3705/ 6200 | consumed samples: 3793920 | consumed tokens: 7769948160 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.745606E+00 | loss scale: 2048.0 | grad norm: 5.764 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.256 | TFLOPs: 42.45 | +[default7]: iteration 3706/ 6200 | consumed samples: 3794944 | consumed tokens: 7772045312 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713417E+00 | loss scale: 2048.0 | grad norm: 5.827 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.277 | TFLOPs: 42.45 | +[default7]: iteration 3707/ 6200 | consumed samples: 3795968 | consumed tokens: 7774142464 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.729235E+00 | loss scale: 2048.0 | grad norm: 5.912 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.303 | TFLOPs: 42.46 | +[default7]: iteration 3708/ 6200 | consumed samples: 3796992 | consumed tokens: 7776239616 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700625E+00 | loss scale: 2048.0 | grad norm: 5.936 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.155 | TFLOPs: 42.42 | +[default7]: iteration 3709/ 6200 | consumed samples: 3798016 | consumed tokens: 7778336768 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713671E+00 | loss scale: 2048.0 | grad norm: 6.764 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.329 | TFLOPs: 42.47 | +[default7]: iteration 3710/ 6200 | consumed samples: 3799040 | consumed tokens: 7780433920 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688891E+00 | loss scale: 2048.0 | grad norm: 5.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.199 | TFLOPs: 42.43 | +[default7]: iteration 3711/ 6200 | consumed samples: 3800064 | consumed tokens: 7782531072 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.721321E+00 | loss scale: 2048.0 | grad norm: 7.045 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.222 | TFLOPs: 42.44 | +[default7]: iteration 3712/ 6200 | consumed samples: 3801088 | consumed tokens: 7784628224 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731833E+00 | loss scale: 2048.0 | grad norm: 4.959 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.296 | TFLOPs: 42.46 | +[default7]: iteration 3713/ 6200 | consumed samples: 3802112 | consumed tokens: 7786725376 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708611E+00 | loss scale: 2048.0 | grad norm: 5.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.164 | TFLOPs: 42.42 | +[default7]: iteration 3714/ 6200 | consumed samples: 3803136 | consumed tokens: 7788822528 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733283E+00 | loss scale: 2048.0 | grad norm: 5.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.859 | TFLOPs: 42.33 | +[default7]: iteration 3715/ 6200 | consumed samples: 3804160 | consumed tokens: 7790919680 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705133E+00 | loss scale: 2048.0 | grad norm: 4.737 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.134 | TFLOPs: 42.41 | +[default7]: iteration 3716/ 6200 | consumed samples: 3805184 | consumed tokens: 7793016832 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719562E+00 | loss scale: 2048.0 | grad norm: 5.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.298 | TFLOPs: 42.46 | +[default7]: iteration 3717/ 6200 | consumed samples: 3806208 | consumed tokens: 7795113984 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711854E+00 | loss scale: 2048.0 | grad norm: 4.947 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.456 | TFLOPs: 42.51 | +[default7]: iteration 3718/ 6200 | consumed samples: 3807232 | consumed tokens: 7797211136 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707544E+00 | loss scale: 2048.0 | grad norm: 5.607 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.248 | TFLOPs: 42.44 | +[default7]: iteration 3719/ 6200 | consumed samples: 3808256 | consumed tokens: 7799308288 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.721545E+00 | loss scale: 2048.0 | grad norm: 5.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.275 | TFLOPs: 42.45 | +[default7]: iteration 3720/ 6200 | consumed samples: 3809280 | consumed tokens: 7801405440 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.736314E+00 | loss scale: 2048.0 | grad norm: 4.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.180 | TFLOPs: 42.42 | +[default7]: iteration 3721/ 6200 | consumed samples: 3810304 | consumed tokens: 7803502592 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712876E+00 | loss scale: 2048.0 | grad norm: 5.891 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.315 | TFLOPs: 42.46 | +[default7]: iteration 3722/ 6200 | consumed samples: 3811328 | consumed tokens: 7805599744 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.732388E+00 | loss scale: 2048.0 | grad norm: 6.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.382 | TFLOPs: 42.48 | +[default7]: iteration 3723/ 6200 | consumed samples: 3812352 | consumed tokens: 7807696896 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705634E+00 | loss scale: 2048.0 | grad norm: 5.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.182 | TFLOPs: 42.42 | +[default7]: iteration 3724/ 6200 | consumed samples: 3813376 | consumed tokens: 7809794048 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706258E+00 | loss scale: 2048.0 | grad norm: 6.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.345 | TFLOPs: 42.47 | +[default7]: iteration 3725/ 6200 | consumed samples: 3814400 | consumed tokens: 7811891200 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717840E+00 | loss scale: 2048.0 | grad norm: 6.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.179 | TFLOPs: 42.42 | +[default7]: iteration 3726/ 6200 | consumed samples: 3815424 | consumed tokens: 7813988352 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.729391E+00 | loss scale: 2048.0 | grad norm: 6.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.366 | TFLOPs: 42.48 | +[default7]: iteration 3727/ 6200 | consumed samples: 3816448 | consumed tokens: 7816085504 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700439E+00 | loss scale: 2048.0 | grad norm: 5.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.359 | TFLOPs: 42.48 | +[default7]: iteration 3728/ 6200 | consumed samples: 3817472 | consumed tokens: 7818182656 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698357E+00 | loss scale: 2048.0 | grad norm: 7.749 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.384 | TFLOPs: 42.49 | +[default7]: iteration 3729/ 6200 | consumed samples: 3818496 | consumed tokens: 7820279808 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726616E+00 | loss scale: 2048.0 | grad norm: 6.765 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.230 | TFLOPs: 42.44 | +[default7]: iteration 3730/ 6200 | consumed samples: 3819520 | consumed tokens: 7822376960 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.734697E+00 | loss scale: 2048.0 | grad norm: 5.041 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.373 | TFLOPs: 42.48 | +[default7]: iteration 3731/ 6200 | consumed samples: 3820544 | consumed tokens: 7824474112 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.701293E+00 | loss scale: 2048.0 | grad norm: 6.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.297 | TFLOPs: 42.46 | +[default7]: iteration 3732/ 6200 | consumed samples: 3821568 | consumed tokens: 7826571264 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.721953E+00 | loss scale: 2048.0 | grad norm: 6.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.191 | TFLOPs: 42.43 | +[default7]: iteration 3733/ 6200 | consumed samples: 3822592 | consumed tokens: 7828668416 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710134E+00 | loss scale: 2048.0 | grad norm: 5.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.392 | TFLOPs: 42.49 | +[default7]: iteration 3734/ 6200 | consumed samples: 3823616 | consumed tokens: 7830765568 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718951E+00 | loss scale: 2048.0 | grad norm: 6.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.328 | TFLOPs: 42.47 | +[default7]: iteration 3735/ 6200 | consumed samples: 3824640 | consumed tokens: 7832862720 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.741169E+00 | loss scale: 2048.0 | grad norm: 5.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.409 | TFLOPs: 42.49 | +[default7]: iteration 3736/ 6200 | consumed samples: 3825664 | consumed tokens: 7834959872 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719966E+00 | loss scale: 2048.0 | grad norm: 6.002 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.265 | TFLOPs: 42.45 | +[default7]: iteration 3737/ 6200 | consumed samples: 3826688 | consumed tokens: 7837057024 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693336E+00 | loss scale: 2048.0 | grad norm: 5.750 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.235 | TFLOPs: 42.44 | +[default7]: iteration 3738/ 6200 | consumed samples: 3827712 | consumed tokens: 7839154176 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731361E+00 | loss scale: 2048.0 | grad norm: 5.010 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.005 | TFLOPs: 42.37 | +[default7]: iteration 3739/ 6200 | consumed samples: 3828736 | consumed tokens: 7841251328 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703426E+00 | loss scale: 2048.0 | grad norm: 5.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.462 | TFLOPs: 42.51 | +[default7]: iteration 3740/ 6200 | consumed samples: 3829760 | consumed tokens: 7843348480 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716155E+00 | loss scale: 2048.0 | grad norm: 5.017 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.382 | TFLOPs: 42.48 | +[default7]: iteration 3741/ 6200 | consumed samples: 3830784 | consumed tokens: 7845445632 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706940E+00 | loss scale: 2048.0 | grad norm: 4.804 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.088 | TFLOPs: 42.40 | +[default7]: iteration 3742/ 6200 | consumed samples: 3831808 | consumed tokens: 7847542784 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.729153E+00 | loss scale: 2048.0 | grad norm: 5.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.482 | TFLOPs: 42.52 | +[default7]: iteration 3743/ 6200 | consumed samples: 3832832 | consumed tokens: 7849639936 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718897E+00 | loss scale: 2048.0 | grad norm: 5.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.112 | TFLOPs: 42.40 | +[default7]: iteration 3744/ 6200 | consumed samples: 3833856 | consumed tokens: 7851737088 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.746827E+00 | loss scale: 2048.0 | grad norm: 5.690 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.975 | TFLOPs: 42.36 | +[default7]: iteration 3745/ 6200 | consumed samples: 3834880 | consumed tokens: 7853834240 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.684910E+00 | loss scale: 2048.0 | grad norm: 4.878 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.899 | TFLOPs: 42.34 | +[default7]: iteration 3746/ 6200 | consumed samples: 3835904 | consumed tokens: 7855931392 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702641E+00 | loss scale: 2048.0 | grad norm: 5.829 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.982 | TFLOPs: 42.36 | +[default7]: iteration 3747/ 6200 | consumed samples: 3836928 | consumed tokens: 7858028544 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712340E+00 | loss scale: 2048.0 | grad norm: 5.660 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.140 | TFLOPs: 42.41 | +[default7]: iteration 3748/ 6200 | consumed samples: 3837952 | consumed tokens: 7860125696 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713949E+00 | loss scale: 2048.0 | grad norm: 5.651 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.414 | TFLOPs: 42.49 | +[default7]: iteration 3749/ 6200 | consumed samples: 3838976 | consumed tokens: 7862222848 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709076E+00 | loss scale: 2048.0 | grad norm: 5.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.486 | TFLOPs: 42.52 | +[default7]: iteration 3750/ 6200 | consumed samples: 3840000 | consumed tokens: 7864320000 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.732198E+00 | loss scale: 2048.0 | grad norm: 5.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.237 | TFLOPs: 42.44 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 3750 | lm loss value: 3.590754E+00 | lm loss PPL: 3.626141E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 3750 | lm loss value: 1.609931E+00 | lm loss PPL: 5.002467E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 3750 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 20:47:42,292] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3750 is begin to save! +[default0]:[2022-10-06 20:47:42,297] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:42,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:42,700] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:42,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:42,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:42,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:42,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:42,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:42,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:42,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:42,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:42,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:42,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:42,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:42,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:42,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:42,907] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:42,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:42,934] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:42,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:42,962] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:42,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:42,989] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:43,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:43,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:43,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,073] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:43,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,102] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:43,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,136] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:43,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,166] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:43,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:43,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,221] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:43,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:43,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:43,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,311] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:43,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,340] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:43,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,373] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_26-model_00-model_states.pt... +[default2]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 20:47:43,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,400] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 20:47:43,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 20:47:43,402] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/mp_rank_00_model_states.pt +[default0]:[2022-10-06 20:47:43,402] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 20:47:43,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 20:47:43,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 20:47:43,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 20:47:43,610] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 20:47:43,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 20:47:43,583] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 20:47:43,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 20:47:43,655] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 20:47:43,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 20:47:43,614] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 20:47:43,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 20:47:43,671] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 20:47:43,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 20:47:43,718] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 20:47:43,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 20:47:43,679] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 20:47:43,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 20:47:43,724] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 20:47:43,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 20:47:43,701] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 20:47:43,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 20:47:43,718] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 20:47:43,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 20:47:43,685] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 20:47:43,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 20:47:43,719] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 20:47:43,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 20:47:43,732] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 20:47:43,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 20:47:43,740] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 20:47:43,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 20:47:43,712] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 20:47:43,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 20:47:43,726] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 20:47:43,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 20:47:43,722] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 20:47:43,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 20:47:43,734] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 20:47:43,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 20:47:43,726] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 20:47:43,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 20:47:43,784] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 20:47:43,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 20:47:43,743] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 20:47:43,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 20:47:43,783] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 20:47:43,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 20:47:43,761] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 20:47:43,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 20:47:43,799] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default5]:[2022-10-06 20:47:43,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 20:47:43,730] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default4]:[2022-10-06 20:47:43,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 20:47:43,723] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default2]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default1]:[2022-10-06 20:47:43,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 20:47:43,725] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default7]:[2022-10-06 20:47:43,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 20:47:43,759] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default7]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default7]:time (ms) | save-checkpoint: 1511.74 +[default1]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default3]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default1]:[2022-10-06 20:47:43,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 20:47:43,753] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default2]:[2022-10-06 20:47:43,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 20:47:43,745] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default6]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default4]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default0]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default6]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default2]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default6]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default7]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default4]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default5]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default0]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default4]:[2022-10-06 20:47:43,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 20:47:43,801] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default5]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default7]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default3]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default2]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default3]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default6]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default0]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default0]: successfully saved checkpoint at iteration 3750 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 20:47:43,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 20:47:43,802] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step3750/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default1]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default3]:[2022-10-06 20:47:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3750 is ready now! +[default7]: iteration 3751/ 6200 | consumed samples: 3841024 | consumed tokens: 7866417152 | elapsed time per iteration (s): 54.32 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735245E+00 | loss scale: 2048.0 | grad norm: 4.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 18.850 | TFLOPs: 5.75 | +[default7]: iteration 3752/ 6200 | consumed samples: 3842048 | consumed tokens: 7868514304 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708343E+00 | loss scale: 2048.0 | grad norm: 5.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.056 | TFLOPs: 42.39 | +[default7]: iteration 3753/ 6200 | consumed samples: 3843072 | consumed tokens: 7870611456 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689116E+00 | loss scale: 2048.0 | grad norm: 5.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.055 | TFLOPs: 42.39 | +[default7]: iteration 3754/ 6200 | consumed samples: 3844096 | consumed tokens: 7872708608 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717914E+00 | loss scale: 2048.0 | grad norm: 5.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.988 | TFLOPs: 42.36 | +[default7]: iteration 3755/ 6200 | consumed samples: 3845120 | consumed tokens: 7874805760 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693347E+00 | loss scale: 2048.0 | grad norm: 5.813 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.082 | TFLOPs: 42.39 | +[default7]: iteration 3756/ 6200 | consumed samples: 3846144 | consumed tokens: 7876902912 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668483E+00 | loss scale: 2048.0 | grad norm: 5.021 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.982 | TFLOPs: 42.36 | +[default7]: iteration 3757/ 6200 | consumed samples: 3847168 | consumed tokens: 7879000064 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707656E+00 | loss scale: 2048.0 | grad norm: 5.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.283 | TFLOPs: 42.45 | +[default7]: iteration 3758/ 6200 | consumed samples: 3848192 | consumed tokens: 7881097216 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742677E+00 | loss scale: 2048.0 | grad norm: 5.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.407 | TFLOPs: 42.49 | +[default7]: iteration 3759/ 6200 | consumed samples: 3849216 | consumed tokens: 7883194368 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717630E+00 | loss scale: 2048.0 | grad norm: 5.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.896 | TFLOPs: 42.34 | +[default7]: iteration 3760/ 6200 | consumed samples: 3850240 | consumed tokens: 7885291520 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708617E+00 | loss scale: 2048.0 | grad norm: 4.883 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.099 | TFLOPs: 42.40 | +[default7]: iteration 3761/ 6200 | consumed samples: 3851264 | consumed tokens: 7887388672 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715659E+00 | loss scale: 2048.0 | grad norm: 5.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.324 | TFLOPs: 42.47 | +[default7]: iteration 3762/ 6200 | consumed samples: 3852288 | consumed tokens: 7889485824 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.729095E+00 | loss scale: 2048.0 | grad norm: 5.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.071 | TFLOPs: 42.39 | +[default7]: iteration 3763/ 6200 | consumed samples: 3853312 | consumed tokens: 7891582976 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718696E+00 | loss scale: 2048.0 | grad norm: 6.776 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.916 | TFLOPs: 42.34 | +[default7]: iteration 3764/ 6200 | consumed samples: 3854336 | consumed tokens: 7893680128 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.684252E+00 | loss scale: 2048.0 | grad norm: 5.612 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.289 | TFLOPs: 42.46 | +[default7]: iteration 3765/ 6200 | consumed samples: 3855360 | consumed tokens: 7895777280 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712105E+00 | loss scale: 2048.0 | grad norm: 5.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.316 | TFLOPs: 42.46 | +[default7]: iteration 3766/ 6200 | consumed samples: 3856384 | consumed tokens: 7897874432 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718670E+00 | loss scale: 2048.0 | grad norm: 5.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.238 | TFLOPs: 42.44 | +[default7]: iteration 3767/ 6200 | consumed samples: 3857408 | consumed tokens: 7899971584 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681423E+00 | loss scale: 2048.0 | grad norm: 5.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.246 | TFLOPs: 42.44 | +[default7]: iteration 3768/ 6200 | consumed samples: 3858432 | consumed tokens: 7902068736 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709867E+00 | loss scale: 2048.0 | grad norm: 5.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.364 | TFLOPs: 42.48 | +[default7]: iteration 3769/ 6200 | consumed samples: 3859456 | consumed tokens: 7904165888 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716069E+00 | loss scale: 2048.0 | grad norm: 6.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.254 | TFLOPs: 42.45 | +[default7]: iteration 3770/ 6200 | consumed samples: 3860480 | consumed tokens: 7906263040 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708349E+00 | loss scale: 2048.0 | grad norm: 8.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.312 | TFLOPs: 42.46 | +[default7]: iteration 3771/ 6200 | consumed samples: 3861504 | consumed tokens: 7908360192 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719940E+00 | loss scale: 2048.0 | grad norm: 5.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.837 | TFLOPs: 42.32 | +[default7]: iteration 3772/ 6200 | consumed samples: 3862528 | consumed tokens: 7910457344 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699859E+00 | loss scale: 2048.0 | grad norm: 5.833 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.034 | TFLOPs: 42.38 | +[default7]: iteration 3773/ 6200 | consumed samples: 3863552 | consumed tokens: 7912554496 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716391E+00 | loss scale: 2048.0 | grad norm: 5.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.962 | TFLOPs: 42.36 | +[default7]: iteration 3774/ 6200 | consumed samples: 3864576 | consumed tokens: 7914651648 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722480E+00 | loss scale: 2048.0 | grad norm: 5.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.192 | TFLOPs: 42.12 | +[default7]: iteration 3775/ 6200 | consumed samples: 3865600 | consumed tokens: 7916748800 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.740654E+00 | loss scale: 2048.0 | grad norm: 4.996 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.986 | TFLOPs: 42.36 | +[default7]: iteration 3776/ 6200 | consumed samples: 3866624 | consumed tokens: 7918845952 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693660E+00 | loss scale: 2048.0 | grad norm: 5.022 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.577 | TFLOPs: 42.24 | +[default7]: iteration 3777/ 6200 | consumed samples: 3867648 | consumed tokens: 7920943104 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696414E+00 | loss scale: 2048.0 | grad norm: 5.063 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.702 | TFLOPs: 42.28 | +[default7]: iteration 3778/ 6200 | consumed samples: 3868672 | consumed tokens: 7923040256 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714060E+00 | loss scale: 2048.0 | grad norm: 4.713 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.786 | TFLOPs: 42.30 | +[default7]: iteration 3779/ 6200 | consumed samples: 3869696 | consumed tokens: 7925137408 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694358E+00 | loss scale: 2048.0 | grad norm: 5.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.632 | TFLOPs: 42.26 | +[default7]: iteration 3780/ 6200 | consumed samples: 3870720 | consumed tokens: 7927234560 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696264E+00 | loss scale: 2048.0 | grad norm: 4.817 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.632 | TFLOPs: 42.26 | +[default7]: iteration 3781/ 6200 | consumed samples: 3871744 | consumed tokens: 7929331712 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.732510E+00 | loss scale: 2048.0 | grad norm: 5.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.474 | TFLOPs: 42.21 | +[default7]: iteration 3782/ 6200 | consumed samples: 3872768 | consumed tokens: 7931428864 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712263E+00 | loss scale: 2048.0 | grad norm: 5.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.697 | TFLOPs: 42.28 | +[default7]: iteration 3783/ 6200 | consumed samples: 3873792 | consumed tokens: 7933526016 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724178E+00 | loss scale: 2048.0 | grad norm: 6.007 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.831 | TFLOPs: 42.32 | +[default7]: iteration 3784/ 6200 | consumed samples: 3874816 | consumed tokens: 7935623168 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719660E+00 | loss scale: 2048.0 | grad norm: 6.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.048 | TFLOPs: 42.38 | +[default7]: iteration 3785/ 6200 | consumed samples: 3875840 | consumed tokens: 7937720320 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688900E+00 | loss scale: 2048.0 | grad norm: 4.957 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.069 | TFLOPs: 42.39 | +[default7]: iteration 3786/ 6200 | consumed samples: 3876864 | consumed tokens: 7939817472 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717319E+00 | loss scale: 2048.0 | grad norm: 5.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.026 | TFLOPs: 42.38 | +[default7]: iteration 3787/ 6200 | consumed samples: 3877888 | consumed tokens: 7941914624 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718544E+00 | loss scale: 2048.0 | grad norm: 6.873 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.872 | TFLOPs: 42.33 | +[default7]: iteration 3788/ 6200 | consumed samples: 3878912 | consumed tokens: 7944011776 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712755E+00 | loss scale: 2048.0 | grad norm: 5.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.642 | TFLOPs: 42.26 | +[default7]: iteration 3789/ 6200 | consumed samples: 3879936 | consumed tokens: 7946108928 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.701713E+00 | loss scale: 2048.0 | grad norm: 5.927 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.225 | TFLOPs: 42.44 | +[default7]: iteration 3790/ 6200 | consumed samples: 3880960 | consumed tokens: 7948206080 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717607E+00 | loss scale: 2048.0 | grad norm: 5.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.775 | TFLOPs: 42.30 | +[default7]: iteration 3791/ 6200 | consumed samples: 3881984 | consumed tokens: 7950303232 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709511E+00 | loss scale: 2048.0 | grad norm: 7.917 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.080 | TFLOPs: 42.39 | +[default7]: iteration 3792/ 6200 | consumed samples: 3883008 | consumed tokens: 7952400384 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682391E+00 | loss scale: 2048.0 | grad norm: 5.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.654 | TFLOPs: 42.26 | +[default7]: iteration 3793/ 6200 | consumed samples: 3884032 | consumed tokens: 7954497536 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715792E+00 | loss scale: 2048.0 | grad norm: 4.950 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.819 | TFLOPs: 42.31 | +[default7]: iteration 3794/ 6200 | consumed samples: 3885056 | consumed tokens: 7956594688 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718265E+00 | loss scale: 2048.0 | grad norm: 6.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.794 | TFLOPs: 42.31 | +[default7]: iteration 3795/ 6200 | consumed samples: 3886080 | consumed tokens: 7958691840 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705011E+00 | loss scale: 2048.0 | grad norm: 5.683 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.888 | TFLOPs: 42.33 | +[default7]: iteration 3796/ 6200 | consumed samples: 3887104 | consumed tokens: 7960788992 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674345E+00 | loss scale: 2048.0 | grad norm: 5.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.032 | TFLOPs: 42.38 | +[default7]: iteration 3797/ 6200 | consumed samples: 3888128 | consumed tokens: 7962886144 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.738659E+00 | loss scale: 2048.0 | grad norm: 5.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.039 | TFLOPs: 42.38 | +[default7]: iteration 3798/ 6200 | consumed samples: 3889152 | consumed tokens: 7964983296 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.723521E+00 | loss scale: 2048.0 | grad norm: 5.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.801 | TFLOPs: 42.31 | +[default7]: iteration 3799/ 6200 | consumed samples: 3890176 | consumed tokens: 7967080448 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713768E+00 | loss scale: 2048.0 | grad norm: 4.853 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.817 | TFLOPs: 42.31 | +[default7]: iteration 3800/ 6200 | consumed samples: 3891200 | consumed tokens: 7969177600 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694639E+00 | loss scale: 2048.0 | grad norm: 5.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.059 | TFLOPs: 42.39 | +[default7]: iteration 3801/ 6200 | consumed samples: 3892224 | consumed tokens: 7971274752 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735083E+00 | loss scale: 2048.0 | grad norm: 6.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.865 | TFLOPs: 42.33 | +[default7]: iteration 3802/ 6200 | consumed samples: 3893248 | consumed tokens: 7973371904 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699201E+00 | loss scale: 2048.0 | grad norm: 6.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.385 | TFLOPs: 42.49 | +[default7]: iteration 3803/ 6200 | consumed samples: 3894272 | consumed tokens: 7975469056 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680429E+00 | loss scale: 2048.0 | grad norm: 5.717 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.926 | TFLOPs: 42.35 | +[default7]: iteration 3804/ 6200 | consumed samples: 3895296 | consumed tokens: 7977566208 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.723318E+00 | loss scale: 2048.0 | grad norm: 4.902 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.099 | TFLOPs: 42.40 | +[default7]: iteration 3805/ 6200 | consumed samples: 3896320 | consumed tokens: 7979663360 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710350E+00 | loss scale: 2048.0 | grad norm: 6.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.016 | TFLOPs: 42.37 | +[default7]: iteration 3806/ 6200 | consumed samples: 3897344 | consumed tokens: 7981760512 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694800E+00 | loss scale: 2048.0 | grad norm: 6.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.936 | TFLOPs: 42.35 | +[default7]: iteration 3807/ 6200 | consumed samples: 3898368 | consumed tokens: 7983857664 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.734377E+00 | loss scale: 2048.0 | grad norm: 5.715 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.924 | TFLOPs: 42.35 | +[default7]: iteration 3808/ 6200 | consumed samples: 3899392 | consumed tokens: 7985954816 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735754E+00 | loss scale: 2048.0 | grad norm: 5.034 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.190 | TFLOPs: 42.43 | +[default7]: iteration 3809/ 6200 | consumed samples: 3900416 | consumed tokens: 7988051968 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692425E+00 | loss scale: 2048.0 | grad norm: 4.638 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.228 | TFLOPs: 42.44 | +[default7]: iteration 3810/ 6200 | consumed samples: 3901440 | consumed tokens: 7990149120 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708111E+00 | loss scale: 2048.0 | grad norm: 5.791 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.813 | TFLOPs: 42.31 | +[default7]: iteration 3811/ 6200 | consumed samples: 3902464 | consumed tokens: 7992246272 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713464E+00 | loss scale: 2048.0 | grad norm: 6.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.907 | TFLOPs: 42.34 | +[default7]: iteration 3812/ 6200 | consumed samples: 3903488 | consumed tokens: 7994343424 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692943E+00 | loss scale: 2048.0 | grad norm: 5.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.009 | TFLOPs: 42.37 | +[default7]: iteration 3813/ 6200 | consumed samples: 3904512 | consumed tokens: 7996440576 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710890E+00 | loss scale: 2048.0 | grad norm: 5.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.314 | TFLOPs: 42.46 | +[default7]: iteration 3814/ 6200 | consumed samples: 3905536 | consumed tokens: 7998537728 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688848E+00 | loss scale: 2048.0 | grad norm: 5.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.406 | TFLOPs: 42.49 | +[default7]: iteration 3815/ 6200 | consumed samples: 3906560 | consumed tokens: 8000634880 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717708E+00 | loss scale: 2048.0 | grad norm: 5.749 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.760 | TFLOPs: 42.30 | +[default7]: iteration 3816/ 6200 | consumed samples: 3907584 | consumed tokens: 8002732032 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688738E+00 | loss scale: 2048.0 | grad norm: 5.606 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.906 | TFLOPs: 42.34 | +[default7]: iteration 3817/ 6200 | consumed samples: 3908608 | consumed tokens: 8004829184 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711339E+00 | loss scale: 2048.0 | grad norm: 5.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.973 | TFLOPs: 42.36 | +[default7]: iteration 3818/ 6200 | consumed samples: 3909632 | consumed tokens: 8006926336 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697822E+00 | loss scale: 2048.0 | grad norm: 5.075 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.684 | TFLOPs: 42.27 | +[default7]: iteration 3819/ 6200 | consumed samples: 3910656 | consumed tokens: 8009023488 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709698E+00 | loss scale: 2048.0 | grad norm: 4.665 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.637 | TFLOPs: 42.26 | +[default7]: iteration 3820/ 6200 | consumed samples: 3911680 | consumed tokens: 8011120640 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716563E+00 | loss scale: 2048.0 | grad norm: 4.967 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.121 | TFLOPs: 42.41 | +[default7]: iteration 3821/ 6200 | consumed samples: 3912704 | consumed tokens: 8013217792 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707724E+00 | loss scale: 2048.0 | grad norm: 5.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.075 | TFLOPs: 42.39 | +[default7]: iteration 3822/ 6200 | consumed samples: 3913728 | consumed tokens: 8015314944 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682818E+00 | loss scale: 2048.0 | grad norm: 5.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.917 | TFLOPs: 42.34 | +[default7]: iteration 3823/ 6200 | consumed samples: 3914752 | consumed tokens: 8017412096 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694579E+00 | loss scale: 2048.0 | grad norm: 4.874 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.088 | TFLOPs: 42.40 | +[default7]: iteration 3824/ 6200 | consumed samples: 3915776 | consumed tokens: 8019509248 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705206E+00 | loss scale: 2048.0 | grad norm: 5.900 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.061 | TFLOPs: 42.39 | +[default7]: iteration 3825/ 6200 | consumed samples: 3916800 | consumed tokens: 8021606400 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703422E+00 | loss scale: 2048.0 | grad norm: 5.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.862 | TFLOPs: 42.33 | +[default7]: iteration 3826/ 6200 | consumed samples: 3917824 | consumed tokens: 8023703552 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695991E+00 | loss scale: 2048.0 | grad norm: 5.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.941 | TFLOPs: 42.35 | +[default7]: iteration 3827/ 6200 | consumed samples: 3918848 | consumed tokens: 8025800704 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695786E+00 | loss scale: 2048.0 | grad norm: 5.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.735 | TFLOPs: 42.29 | +[default7]: iteration 3828/ 6200 | consumed samples: 3919872 | consumed tokens: 8027897856 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710762E+00 | loss scale: 2048.0 | grad norm: 4.969 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.571 | TFLOPs: 42.24 | +[default7]: iteration 3829/ 6200 | consumed samples: 3920896 | consumed tokens: 8029995008 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719108E+00 | loss scale: 2048.0 | grad norm: 5.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.989 | TFLOPs: 42.37 | +[default7]: iteration 3830/ 6200 | consumed samples: 3921920 | consumed tokens: 8032092160 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714514E+00 | loss scale: 2048.0 | grad norm: 5.012 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.606 | TFLOPs: 42.25 | +[default7]: iteration 3831/ 6200 | consumed samples: 3922944 | consumed tokens: 8034189312 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.720877E+00 | loss scale: 2048.0 | grad norm: 5.575 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.657 | TFLOPs: 42.26 | +[default7]: iteration 3832/ 6200 | consumed samples: 3923968 | consumed tokens: 8036286464 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707198E+00 | loss scale: 2048.0 | grad norm: 4.999 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.989 | TFLOPs: 42.37 | +[default7]: iteration 3833/ 6200 | consumed samples: 3924992 | consumed tokens: 8038383616 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674020E+00 | loss scale: 2048.0 | grad norm: 5.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.892 | TFLOPs: 42.34 | +[default7]: iteration 3834/ 6200 | consumed samples: 3926016 | consumed tokens: 8040480768 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692809E+00 | loss scale: 2048.0 | grad norm: 4.886 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.725 | TFLOPs: 42.28 | +[default7]: iteration 3835/ 6200 | consumed samples: 3927040 | consumed tokens: 8042577920 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714001E+00 | loss scale: 2048.0 | grad norm: 4.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.898 | TFLOPs: 42.34 | +[default7]: iteration 3836/ 6200 | consumed samples: 3928064 | consumed tokens: 8044675072 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695345E+00 | loss scale: 2048.0 | grad norm: 5.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.045 | TFLOPs: 42.38 | +[default7]: iteration 3837/ 6200 | consumed samples: 3929088 | consumed tokens: 8046772224 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693195E+00 | loss scale: 2048.0 | grad norm: 4.720 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.836 | TFLOPs: 42.32 | +[default7]: iteration 3838/ 6200 | consumed samples: 3930112 | consumed tokens: 8048869376 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.725159E+00 | loss scale: 2048.0 | grad norm: 5.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.193 | TFLOPs: 42.43 | +[default7]: iteration 3839/ 6200 | consumed samples: 3931136 | consumed tokens: 8050966528 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711701E+00 | loss scale: 2048.0 | grad norm: 5.831 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.020 | TFLOPs: 42.37 | +[default7]: iteration 3840/ 6200 | consumed samples: 3932160 | consumed tokens: 8053063680 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.742953E+00 | loss scale: 2048.0 | grad norm: 5.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.809 | TFLOPs: 42.31 | +[default7]: iteration 3841/ 6200 | consumed samples: 3933184 | consumed tokens: 8055160832 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714002E+00 | loss scale: 2048.0 | grad norm: 5.030 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.664 | TFLOPs: 42.27 | +[default7]: iteration 3842/ 6200 | consumed samples: 3934208 | consumed tokens: 8057257984 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708724E+00 | loss scale: 2048.0 | grad norm: 5.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.098 | TFLOPs: 42.40 | +[default7]: iteration 3843/ 6200 | consumed samples: 3935232 | consumed tokens: 8059355136 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717564E+00 | loss scale: 2048.0 | grad norm: 5.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.150 | TFLOPs: 42.41 | +[default7]: iteration 3844/ 6200 | consumed samples: 3936256 | consumed tokens: 8061452288 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715305E+00 | loss scale: 2048.0 | grad norm: 4.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.739 | TFLOPs: 42.29 | +[default7]: iteration 3845/ 6200 | consumed samples: 3937280 | consumed tokens: 8063549440 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691854E+00 | loss scale: 2048.0 | grad norm: 5.029 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.867 | TFLOPs: 42.33 | +[default7]: iteration 3846/ 6200 | consumed samples: 3938304 | consumed tokens: 8065646592 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703690E+00 | loss scale: 2048.0 | grad norm: 5.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.003 | TFLOPs: 42.37 | +[default7]: iteration 3847/ 6200 | consumed samples: 3939328 | consumed tokens: 8067743744 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695390E+00 | loss scale: 2048.0 | grad norm: 4.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.075 | TFLOPs: 42.39 | +[default7]: iteration 3848/ 6200 | consumed samples: 3940352 | consumed tokens: 8069840896 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680754E+00 | loss scale: 2048.0 | grad norm: 6.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.732 | TFLOPs: 42.29 | +[default7]: iteration 3849/ 6200 | consumed samples: 3941376 | consumed tokens: 8071938048 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716262E+00 | loss scale: 2048.0 | grad norm: 7.913 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.626 | TFLOPs: 42.25 | +[default7]: iteration 3850/ 6200 | consumed samples: 3942400 | consumed tokens: 8074035200 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.720546E+00 | loss scale: 2048.0 | grad norm: 4.993 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.926 | TFLOPs: 42.35 | +[default7]: iteration 3851/ 6200 | consumed samples: 3943424 | consumed tokens: 8076132352 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705760E+00 | loss scale: 2048.0 | grad norm: 5.717 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.073 | TFLOPs: 42.39 | +[default7]: iteration 3852/ 6200 | consumed samples: 3944448 | consumed tokens: 8078229504 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704927E+00 | loss scale: 2048.0 | grad norm: 5.951 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.133 | TFLOPs: 42.41 | +[default7]: iteration 3853/ 6200 | consumed samples: 3945472 | consumed tokens: 8080326656 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703792E+00 | loss scale: 2048.0 | grad norm: 5.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.957 | TFLOPs: 42.36 | +[default7]: iteration 3854/ 6200 | consumed samples: 3946496 | consumed tokens: 8082423808 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.732970E+00 | loss scale: 2048.0 | grad norm: 6.766 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.040 | TFLOPs: 42.38 | +[default7]: iteration 3855/ 6200 | consumed samples: 3947520 | consumed tokens: 8084520960 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679320E+00 | loss scale: 2048.0 | grad norm: 5.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.620 | TFLOPs: 42.25 | +[default7]: iteration 3856/ 6200 | consumed samples: 3948544 | consumed tokens: 8086618112 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715179E+00 | loss scale: 2048.0 | grad norm: 6.710 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.780 | TFLOPs: 42.30 | +[default7]: iteration 3857/ 6200 | consumed samples: 3949568 | consumed tokens: 8088715264 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713579E+00 | loss scale: 2048.0 | grad norm: 6.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.817 | TFLOPs: 42.31 | +[default7]: iteration 3858/ 6200 | consumed samples: 3950592 | consumed tokens: 8090812416 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735129E+00 | loss scale: 2048.0 | grad norm: 5.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.917 | TFLOPs: 42.34 | +[default7]: iteration 3859/ 6200 | consumed samples: 3951616 | consumed tokens: 8092909568 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709950E+00 | loss scale: 2048.0 | grad norm: 5.933 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.487 | TFLOPs: 42.21 | +[default7]: iteration 3860/ 6200 | consumed samples: 3952640 | consumed tokens: 8095006720 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695164E+00 | loss scale: 2048.0 | grad norm: 5.875 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.017 | TFLOPs: 42.37 | +[default7]: iteration 3861/ 6200 | consumed samples: 3953664 | consumed tokens: 8097103872 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727373E+00 | loss scale: 2048.0 | grad norm: 5.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.808 | TFLOPs: 42.31 | +[default7]: iteration 3862/ 6200 | consumed samples: 3954688 | consumed tokens: 8099201024 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705503E+00 | loss scale: 4096.0 | grad norm: 3.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.709 | TFLOPs: 42.28 | +[default7]: iteration 3863/ 6200 | consumed samples: 3955712 | consumed tokens: 8101298176 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.718316E+00 | loss scale: 4096.0 | grad norm: 6.901 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.344 | TFLOPs: 42.17 | +[default7]: iteration 3864/ 6200 | consumed samples: 3956736 | consumed tokens: 8103395328 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689429E+00 | loss scale: 4096.0 | grad norm: 4.770 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.859 | TFLOPs: 42.33 | +[default7]: iteration 3865/ 6200 | consumed samples: 3957760 | consumed tokens: 8105492480 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.728698E+00 | loss scale: 4096.0 | grad norm: 5.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.135 | TFLOPs: 42.41 | +[default7]: iteration 3866/ 6200 | consumed samples: 3958784 | consumed tokens: 8107589632 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717220E+00 | loss scale: 4096.0 | grad norm: 5.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.797 | TFLOPs: 42.31 | +[default7]: iteration 3867/ 6200 | consumed samples: 3959808 | consumed tokens: 8109686784 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.720361E+00 | loss scale: 4096.0 | grad norm: 5.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.848 | TFLOPs: 42.32 | +[default7]: iteration 3868/ 6200 | consumed samples: 3960832 | consumed tokens: 8111783936 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706675E+00 | loss scale: 4096.0 | grad norm: 5.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.045 | TFLOPs: 42.38 | +[default7]: iteration 3869/ 6200 | consumed samples: 3961856 | consumed tokens: 8113881088 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731706E+00 | loss scale: 4096.0 | grad norm: 5.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.971 | TFLOPs: 42.36 | +[default7]: iteration 3870/ 6200 | consumed samples: 3962880 | consumed tokens: 8115978240 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683956E+00 | loss scale: 4096.0 | grad norm: 5.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.977 | TFLOPs: 42.36 | +[default7]: iteration 3871/ 6200 | consumed samples: 3963904 | consumed tokens: 8118075392 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700253E+00 | loss scale: 4096.0 | grad norm: 5.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.968 | TFLOPs: 42.36 | +[default7]: iteration 3872/ 6200 | consumed samples: 3964928 | consumed tokens: 8120172544 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719808E+00 | loss scale: 4096.0 | grad norm: 5.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.209 | TFLOPs: 42.43 | +[default7]: iteration 3873/ 6200 | consumed samples: 3965952 | consumed tokens: 8122269696 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699063E+00 | loss scale: 4096.0 | grad norm: 7.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.255 | TFLOPs: 42.45 | +[default7]: iteration 3874/ 6200 | consumed samples: 3966976 | consumed tokens: 8124366848 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685265E+00 | loss scale: 4096.0 | grad norm: 6.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.105 | TFLOPs: 42.40 | +[default7]: iteration 3875/ 6200 | consumed samples: 3968000 | consumed tokens: 8126464000 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.731176E+00 | loss scale: 4096.0 | grad norm: 5.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.027 | TFLOPs: 42.38 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 3875 | lm loss value: 3.586642E+00 | lm loss PPL: 3.611260E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 3875 | lm loss value: 1.602036E+00 | lm loss PPL: 4.963125E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 3876/ 6200 | consumed samples: 3969024 | consumed tokens: 8128561152 | elapsed time per iteration (s): 51.72 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713804E+00 | loss scale: 4096.0 | grad norm: 5.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.799 | TFLOPs: 6.03 | +[default7]: iteration 3877/ 6200 | consumed samples: 3970048 | consumed tokens: 8130658304 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694518E+00 | loss scale: 4096.0 | grad norm: 5.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.659 | TFLOPs: 42.26 | +[default7]: iteration 3878/ 6200 | consumed samples: 3971072 | consumed tokens: 8132755456 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713740E+00 | loss scale: 4096.0 | grad norm: 4.875 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.004 | TFLOPs: 42.37 | +[default7]: iteration 3879/ 6200 | consumed samples: 3972096 | consumed tokens: 8134852608 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695521E+00 | loss scale: 4096.0 | grad norm: 5.822 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.992 | TFLOPs: 42.37 | +[default7]: iteration 3880/ 6200 | consumed samples: 3973120 | consumed tokens: 8136949760 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676322E+00 | loss scale: 4096.0 | grad norm: 5.771 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.592 | TFLOPs: 42.24 | +[default7]: iteration 3881/ 6200 | consumed samples: 3974144 | consumed tokens: 8139046912 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708489E+00 | loss scale: 4096.0 | grad norm: 5.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.725 | TFLOPs: 42.28 | +[default7]: iteration 3882/ 6200 | consumed samples: 3975168 | consumed tokens: 8141144064 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692791E+00 | loss scale: 4096.0 | grad norm: 5.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.546 | TFLOPs: 42.23 | +[default7]: iteration 3883/ 6200 | consumed samples: 3976192 | consumed tokens: 8143241216 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706210E+00 | loss scale: 4096.0 | grad norm: 7.682 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.397 | TFLOPs: 42.18 | +[default7]: iteration 3884/ 6200 | consumed samples: 3977216 | consumed tokens: 8145338368 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700557E+00 | loss scale: 4096.0 | grad norm: 5.808 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.438 | TFLOPs: 42.20 | +[default7]: iteration 3885/ 6200 | consumed samples: 3978240 | consumed tokens: 8147435520 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673987E+00 | loss scale: 4096.0 | grad norm: 5.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.306 | TFLOPs: 42.16 | +[default7]: iteration 3886/ 6200 | consumed samples: 3979264 | consumed tokens: 8149532672 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.766785E+00 | loss scale: 4096.0 | grad norm: 7.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.217 | TFLOPs: 42.13 | +[default7]: iteration 3887/ 6200 | consumed samples: 3980288 | consumed tokens: 8151629824 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.725287E+00 | loss scale: 4096.0 | grad norm: 7.714 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.635 | TFLOPs: 42.26 | +[default7]: iteration 3888/ 6200 | consumed samples: 3981312 | consumed tokens: 8153726976 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694480E+00 | loss scale: 4096.0 | grad norm: 5.829 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.032 | TFLOPs: 42.38 | +[default7]: iteration 3889/ 6200 | consumed samples: 3982336 | consumed tokens: 8155824128 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697043E+00 | loss scale: 4096.0 | grad norm: 5.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.576 | TFLOPs: 42.24 | +[default7]: iteration 3890/ 6200 | consumed samples: 3983360 | consumed tokens: 8157921280 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717865E+00 | loss scale: 4096.0 | grad norm: 5.970 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.038 | TFLOPs: 42.38 | +[default7]: iteration 3891/ 6200 | consumed samples: 3984384 | consumed tokens: 8160018432 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698287E+00 | loss scale: 4096.0 | grad norm: 6.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.255 | TFLOPs: 42.45 | +[default7]: iteration 3892/ 6200 | consumed samples: 3985408 | consumed tokens: 8162115584 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675895E+00 | loss scale: 4096.0 | grad norm: 5.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.850 | TFLOPs: 42.32 | +[default7]: iteration 3893/ 6200 | consumed samples: 3986432 | consumed tokens: 8164212736 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703662E+00 | loss scale: 4096.0 | grad norm: 7.054 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.647 | TFLOPs: 42.26 | +[default7]: iteration 3894/ 6200 | consumed samples: 3987456 | consumed tokens: 8166309888 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710429E+00 | loss scale: 4096.0 | grad norm: 6.573 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.834 | TFLOPs: 42.32 | +[default7]: iteration 3895/ 6200 | consumed samples: 3988480 | consumed tokens: 8168407040 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698389E+00 | loss scale: 4096.0 | grad norm: 6.952 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.662 | TFLOPs: 42.27 | +[default7]: iteration 3896/ 6200 | consumed samples: 3989504 | consumed tokens: 8170504192 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716612E+00 | loss scale: 4096.0 | grad norm: 6.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.696 | TFLOPs: 42.28 | +[default7]: iteration 3897/ 6200 | consumed samples: 3990528 | consumed tokens: 8172601344 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688820E+00 | loss scale: 4096.0 | grad norm: 5.728 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.610 | TFLOPs: 42.25 | +[default7]: iteration 3898/ 6200 | consumed samples: 3991552 | consumed tokens: 8174698496 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692406E+00 | loss scale: 4096.0 | grad norm: 5.559 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.662 | TFLOPs: 42.27 | +[default7]: iteration 3899/ 6200 | consumed samples: 3992576 | consumed tokens: 8176795648 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691961E+00 | loss scale: 4096.0 | grad norm: 5.933 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.663 | TFLOPs: 42.27 | +[default7]: iteration 3900/ 6200 | consumed samples: 3993600 | consumed tokens: 8178892800 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.732251E+00 | loss scale: 4096.0 | grad norm: 4.626 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.412 | TFLOPs: 42.19 | +[default7]: iteration 3901/ 6200 | consumed samples: 3994624 | consumed tokens: 8180989952 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705933E+00 | loss scale: 4096.0 | grad norm: 6.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.546 | TFLOPs: 42.23 | +[default7]: iteration 3902/ 6200 | consumed samples: 3995648 | consumed tokens: 8183087104 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719175E+00 | loss scale: 4096.0 | grad norm: 5.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.802 | TFLOPs: 42.31 | +[default7]: iteration 3903/ 6200 | consumed samples: 3996672 | consumed tokens: 8185184256 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689347E+00 | loss scale: 4096.0 | grad norm: 5.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.076 | TFLOPs: 42.39 | +[default7]: iteration 3904/ 6200 | consumed samples: 3997696 | consumed tokens: 8187281408 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.721673E+00 | loss scale: 4096.0 | grad norm: 5.603 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.977 | TFLOPs: 42.36 | +[default7]: iteration 3905/ 6200 | consumed samples: 3998720 | consumed tokens: 8189378560 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697939E+00 | loss scale: 4096.0 | grad norm: 5.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.081 | TFLOPs: 42.39 | +[default7]: iteration 3906/ 6200 | consumed samples: 3999744 | consumed tokens: 8191475712 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.728768E+00 | loss scale: 4096.0 | grad norm: 4.729 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.132 | TFLOPs: 42.41 | +[default7]: iteration 3907/ 6200 | consumed samples: 4000768 | consumed tokens: 8193572864 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707428E+00 | loss scale: 4096.0 | grad norm: 5.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.146 | TFLOPs: 42.41 | +[default7]: iteration 3908/ 6200 | consumed samples: 4001792 | consumed tokens: 8195670016 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705930E+00 | loss scale: 4096.0 | grad norm: 4.866 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.012 | TFLOPs: 42.37 | +[default7]: iteration 3909/ 6200 | consumed samples: 4002816 | consumed tokens: 8197767168 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706927E+00 | loss scale: 4096.0 | grad norm: 5.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.054 | TFLOPs: 42.38 | +[default7]: iteration 3910/ 6200 | consumed samples: 4003840 | consumed tokens: 8199864320 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694279E+00 | loss scale: 4096.0 | grad norm: 5.825 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.704 | TFLOPs: 42.28 | +[default7]: iteration 3911/ 6200 | consumed samples: 4004864 | consumed tokens: 8201961472 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691547E+00 | loss scale: 4096.0 | grad norm: 5.827 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.899 | TFLOPs: 42.34 | +[default7]: iteration 3912/ 6200 | consumed samples: 4005888 | consumed tokens: 8204058624 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687145E+00 | loss scale: 4096.0 | grad norm: 5.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.867 | TFLOPs: 42.33 | +[default7]: iteration 3913/ 6200 | consumed samples: 4006912 | consumed tokens: 8206155776 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.721150E+00 | loss scale: 4096.0 | grad norm: 5.718 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.022 | TFLOPs: 42.37 | +[default7]: iteration 3914/ 6200 | consumed samples: 4007936 | consumed tokens: 8208252928 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710770E+00 | loss scale: 4096.0 | grad norm: 6.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.693 | TFLOPs: 42.27 | +[default7]: iteration 3915/ 6200 | consumed samples: 4008960 | consumed tokens: 8210350080 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727463E+00 | loss scale: 4096.0 | grad norm: 6.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.951 | TFLOPs: 42.35 | +[default7]: iteration 3916/ 6200 | consumed samples: 4009984 | consumed tokens: 8212447232 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694764E+00 | loss scale: 4096.0 | grad norm: 5.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.899 | TFLOPs: 42.34 | +[default7]: iteration 3917/ 6200 | consumed samples: 4011008 | consumed tokens: 8214544384 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714040E+00 | loss scale: 4096.0 | grad norm: 4.949 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.990 | TFLOPs: 42.37 | +[default7]: iteration 3918/ 6200 | consumed samples: 4012032 | consumed tokens: 8216641536 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712626E+00 | loss scale: 4096.0 | grad norm: 5.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.060 | TFLOPs: 42.39 | +[default7]: iteration 3919/ 6200 | consumed samples: 4013056 | consumed tokens: 8218738688 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696360E+00 | loss scale: 4096.0 | grad norm: 5.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.978 | TFLOPs: 42.36 | +[default7]: iteration 3920/ 6200 | consumed samples: 4014080 | consumed tokens: 8220835840 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710215E+00 | loss scale: 4096.0 | grad norm: 5.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.045 | TFLOPs: 42.38 | +[default7]: iteration 3921/ 6200 | consumed samples: 4015104 | consumed tokens: 8222932992 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713890E+00 | loss scale: 4096.0 | grad norm: 5.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.981 | TFLOPs: 42.36 | +[default7]: iteration 3922/ 6200 | consumed samples: 4016128 | consumed tokens: 8225030144 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719138E+00 | loss scale: 4096.0 | grad norm: 5.874 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.121 | TFLOPs: 41.80 | +[default7]: iteration 3923/ 6200 | consumed samples: 4017152 | consumed tokens: 8227127296 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.735525E+00 | loss scale: 4096.0 | grad norm: 5.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.622 | TFLOPs: 42.25 | +[default7]: iteration 3924/ 6200 | consumed samples: 4018176 | consumed tokens: 8229224448 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696019E+00 | loss scale: 4096.0 | grad norm: 5.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.036 | TFLOPs: 42.38 | +[default7]: iteration 3925/ 6200 | consumed samples: 4019200 | consumed tokens: 8231321600 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711395E+00 | loss scale: 4096.0 | grad norm: 6.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.873 | TFLOPs: 42.33 | +[default7]: iteration 3926/ 6200 | consumed samples: 4020224 | consumed tokens: 8233418752 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706210E+00 | loss scale: 4096.0 | grad norm: 5.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.871 | TFLOPs: 42.33 | +[default7]: iteration 3927/ 6200 | consumed samples: 4021248 | consumed tokens: 8235515904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702448E+00 | loss scale: 4096.0 | grad norm: 5.780 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.846 | TFLOPs: 42.32 | +[default7]: iteration 3928/ 6200 | consumed samples: 4022272 | consumed tokens: 8237613056 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704209E+00 | loss scale: 4096.0 | grad norm: 5.031 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.745 | TFLOPs: 42.29 | +[default7]: iteration 3929/ 6200 | consumed samples: 4023296 | consumed tokens: 8239710208 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682893E+00 | loss scale: 4096.0 | grad norm: 5.075 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.896 | TFLOPs: 42.34 | +[default7]: iteration 3930/ 6200 | consumed samples: 4024320 | consumed tokens: 8241807360 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.720386E+00 | loss scale: 4096.0 | grad norm: 6.788 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.569 | TFLOPs: 42.24 | +[default7]: iteration 3931/ 6200 | consumed samples: 4025344 | consumed tokens: 8243904512 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710329E+00 | loss scale: 4096.0 | grad norm: 4.937 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.802 | TFLOPs: 42.31 | +[default7]: iteration 3932/ 6200 | consumed samples: 4026368 | consumed tokens: 8246001664 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699942E+00 | loss scale: 4096.0 | grad norm: 5.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.827 | TFLOPs: 42.32 | +[default7]: iteration 3933/ 6200 | consumed samples: 4027392 | consumed tokens: 8248098816 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694695E+00 | loss scale: 4096.0 | grad norm: 5.017 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.542 | TFLOPs: 42.23 | +[default7]: iteration 3934/ 6200 | consumed samples: 4028416 | consumed tokens: 8250195968 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713556E+00 | loss scale: 4096.0 | grad norm: 5.045 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.335 | TFLOPs: 42.17 | +[default7]: iteration 3935/ 6200 | consumed samples: 4029440 | consumed tokens: 8252293120 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694080E+00 | loss scale: 4096.0 | grad norm: 5.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.806 | TFLOPs: 42.31 | +[default7]: iteration 3936/ 6200 | consumed samples: 4030464 | consumed tokens: 8254390272 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716367E+00 | loss scale: 4096.0 | grad norm: 6.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.903 | TFLOPs: 42.34 | +[default7]: iteration 3937/ 6200 | consumed samples: 4031488 | consumed tokens: 8256487424 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716321E+00 | loss scale: 4096.0 | grad norm: 4.929 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.907 | TFLOPs: 42.34 | +[default7]: iteration 3938/ 6200 | consumed samples: 4032512 | consumed tokens: 8258584576 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711371E+00 | loss scale: 4096.0 | grad norm: 5.741 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.679 | TFLOPs: 42.27 | +[default7]: iteration 3939/ 6200 | consumed samples: 4033536 | consumed tokens: 8260681728 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700678E+00 | loss scale: 4096.0 | grad norm: 4.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.086 | TFLOPs: 42.39 | +[default7]: iteration 3940/ 6200 | consumed samples: 4034560 | consumed tokens: 8262778880 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.737841E+00 | loss scale: 4096.0 | grad norm: 5.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.790 | TFLOPs: 42.30 | +[default7]: iteration 3941/ 6200 | consumed samples: 4035584 | consumed tokens: 8264876032 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703417E+00 | loss scale: 4096.0 | grad norm: 5.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.185 | TFLOPs: 42.42 | +[default7]: iteration 3942/ 6200 | consumed samples: 4036608 | consumed tokens: 8266973184 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699879E+00 | loss scale: 4096.0 | grad norm: 5.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.099 | TFLOPs: 42.40 | +[default7]: iteration 3943/ 6200 | consumed samples: 4037632 | consumed tokens: 8269070336 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693286E+00 | loss scale: 4096.0 | grad norm: 5.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.038 | TFLOPs: 42.38 | +[default7]: iteration 3944/ 6200 | consumed samples: 4038656 | consumed tokens: 8271167488 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694412E+00 | loss scale: 4096.0 | grad norm: 6.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.715 | TFLOPs: 42.28 | +[default7]: iteration 3945/ 6200 | consumed samples: 4039680 | consumed tokens: 8273264640 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.723981E+00 | loss scale: 4096.0 | grad norm: 5.030 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.780 | TFLOPs: 42.30 | +[default7]: iteration 3946/ 6200 | consumed samples: 4040704 | consumed tokens: 8275361792 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707221E+00 | loss scale: 4096.0 | grad norm: 6.045 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.812 | TFLOPs: 42.31 | +[default7]: iteration 3947/ 6200 | consumed samples: 4041728 | consumed tokens: 8277458944 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697558E+00 | loss scale: 4096.0 | grad norm: 5.864 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.810 | TFLOPs: 42.31 | +[default7]: iteration 3948/ 6200 | consumed samples: 4042752 | consumed tokens: 8279556096 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.684738E+00 | loss scale: 4096.0 | grad norm: 6.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.779 | TFLOPs: 42.30 | +[default7]: iteration 3949/ 6200 | consumed samples: 4043776 | consumed tokens: 8281653248 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685719E+00 | loss scale: 4096.0 | grad norm: 5.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.984 | TFLOPs: 42.36 | +[default7]: iteration 3950/ 6200 | consumed samples: 4044800 | consumed tokens: 8283750400 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702140E+00 | loss scale: 4096.0 | grad norm: 6.869 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.011 | TFLOPs: 42.37 | +[default7]: iteration 3951/ 6200 | consumed samples: 4045824 | consumed tokens: 8285847552 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717130E+00 | loss scale: 4096.0 | grad norm: 6.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.775 | TFLOPs: 42.30 | +[default7]: iteration 3952/ 6200 | consumed samples: 4046848 | consumed tokens: 8287944704 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698782E+00 | loss scale: 4096.0 | grad norm: 4.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.855 | TFLOPs: 42.32 | +[default7]: iteration 3953/ 6200 | consumed samples: 4047872 | consumed tokens: 8290041856 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691112E+00 | loss scale: 4096.0 | grad norm: 6.840 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.040 | TFLOPs: 42.38 | +[default7]: iteration 3954/ 6200 | consumed samples: 4048896 | consumed tokens: 8292139008 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697737E+00 | loss scale: 4096.0 | grad norm: 5.941 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.170 | TFLOPs: 42.42 | +[default7]: iteration 3955/ 6200 | consumed samples: 4049920 | consumed tokens: 8294236160 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692297E+00 | loss scale: 4096.0 | grad norm: 7.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.380 | TFLOPs: 42.18 | +[default7]: iteration 3956/ 6200 | consumed samples: 4050944 | consumed tokens: 8296333312 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689905E+00 | loss scale: 4096.0 | grad norm: 6.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.552 | TFLOPs: 42.23 | +[default7]: iteration 3957/ 6200 | consumed samples: 4051968 | consumed tokens: 8298430464 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709783E+00 | loss scale: 4096.0 | grad norm: 5.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.876 | TFLOPs: 42.33 | +[default7]: iteration 3958/ 6200 | consumed samples: 4052992 | consumed tokens: 8300527616 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703064E+00 | loss scale: 4096.0 | grad norm: 7.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.047 | TFLOPs: 42.38 | +[default7]: iteration 3959/ 6200 | consumed samples: 4054016 | consumed tokens: 8302624768 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705305E+00 | loss scale: 4096.0 | grad norm: 7.842 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.205 | TFLOPs: 42.43 | +[default7]: iteration 3960/ 6200 | consumed samples: 4055040 | consumed tokens: 8304721920 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712114E+00 | loss scale: 4096.0 | grad norm: 6.955 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.000 | TFLOPs: 42.37 | +[default7]: iteration 3961/ 6200 | consumed samples: 4056064 | consumed tokens: 8306819072 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.720758E+00 | loss scale: 4096.0 | grad norm: 5.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.098 | TFLOPs: 42.40 | +[default7]: iteration 3962/ 6200 | consumed samples: 4057088 | consumed tokens: 8308916224 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695423E+00 | loss scale: 4096.0 | grad norm: 6.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.259 | TFLOPs: 42.45 | +[default7]: iteration 3963/ 6200 | consumed samples: 4058112 | consumed tokens: 8311013376 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712859E+00 | loss scale: 4096.0 | grad norm: 6.077 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.507 | TFLOPs: 42.22 | +[default7]: iteration 3964/ 6200 | consumed samples: 4059136 | consumed tokens: 8313110528 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703594E+00 | loss scale: 4096.0 | grad norm: 5.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.494 | TFLOPs: 42.21 | +[default7]: iteration 3965/ 6200 | consumed samples: 4060160 | consumed tokens: 8315207680 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695056E+00 | loss scale: 4096.0 | grad norm: 5.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.524 | TFLOPs: 42.22 | +[default7]: iteration 3966/ 6200 | consumed samples: 4061184 | consumed tokens: 8317304832 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702121E+00 | loss scale: 4096.0 | grad norm: 5.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.917 | TFLOPs: 42.34 | +[default7]: iteration 3967/ 6200 | consumed samples: 4062208 | consumed tokens: 8319401984 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705235E+00 | loss scale: 4096.0 | grad norm: 6.042 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.573 | TFLOPs: 42.24 | +[default7]: iteration 3968/ 6200 | consumed samples: 4063232 | consumed tokens: 8321499136 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700946E+00 | loss scale: 4096.0 | grad norm: 5.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.554 | TFLOPs: 42.23 | +[default7]: iteration 3969/ 6200 | consumed samples: 4064256 | consumed tokens: 8323596288 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700737E+00 | loss scale: 4096.0 | grad norm: 6.012 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.158 | TFLOPs: 42.42 | +[default7]: iteration 3970/ 6200 | consumed samples: 4065280 | consumed tokens: 8325693440 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707888E+00 | loss scale: 4096.0 | grad norm: 5.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.658 | TFLOPs: 41.96 | +[default7]: iteration 3971/ 6200 | consumed samples: 4066304 | consumed tokens: 8327790592 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691561E+00 | loss scale: 4096.0 | grad norm: 5.772 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.117 | TFLOPs: 42.40 | +[default7]: iteration 3972/ 6200 | consumed samples: 4067328 | consumed tokens: 8329887744 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705639E+00 | loss scale: 4096.0 | grad norm: 6.701 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.207 | TFLOPs: 42.43 | +[default7]: iteration 3973/ 6200 | consumed samples: 4068352 | consumed tokens: 8331984896 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.733469E+00 | loss scale: 4096.0 | grad norm: 5.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.200 | TFLOPs: 42.43 | +[default7]: iteration 3974/ 6200 | consumed samples: 4069376 | consumed tokens: 8334082048 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687363E+00 | loss scale: 4096.0 | grad norm: 5.938 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.041 | TFLOPs: 42.38 | +[default7]: iteration 3975/ 6200 | consumed samples: 4070400 | consumed tokens: 8336179200 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710030E+00 | loss scale: 4096.0 | grad norm: 6.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.486 | TFLOPs: 42.21 | +[default7]: iteration 3976/ 6200 | consumed samples: 4071424 | consumed tokens: 8338276352 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681855E+00 | loss scale: 4096.0 | grad norm: 5.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.424 | TFLOPs: 42.19 | +[default7]: iteration 3977/ 6200 | consumed samples: 4072448 | consumed tokens: 8340373504 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.720751E+00 | loss scale: 4096.0 | grad norm: 5.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.147 | TFLOPs: 42.11 | +[default7]: iteration 3978/ 6200 | consumed samples: 4073472 | consumed tokens: 8342470656 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697208E+00 | loss scale: 4096.0 | grad norm: 4.889 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.338 | TFLOPs: 42.17 | +[default7]: iteration 3979/ 6200 | consumed samples: 4074496 | consumed tokens: 8344567808 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708985E+00 | loss scale: 4096.0 | grad norm: 5.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.483 | TFLOPs: 42.21 | +[default7]: iteration 3980/ 6200 | consumed samples: 4075520 | consumed tokens: 8346664960 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678014E+00 | loss scale: 4096.0 | grad norm: 7.032 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.884 | TFLOPs: 42.33 | +[default7]: iteration 3981/ 6200 | consumed samples: 4076544 | consumed tokens: 8348762112 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700400E+00 | loss scale: 4096.0 | grad norm: 4.935 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.225 | TFLOPs: 42.44 | +[default7]: iteration 3982/ 6200 | consumed samples: 4077568 | consumed tokens: 8350859264 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711836E+00 | loss scale: 4096.0 | grad norm: 5.892 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.793 | TFLOPs: 42.31 | +[default7]: iteration 3983/ 6200 | consumed samples: 4078592 | consumed tokens: 8352956416 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699395E+00 | loss scale: 4096.0 | grad norm: 5.975 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.910 | TFLOPs: 42.34 | +[default7]: iteration 3984/ 6200 | consumed samples: 4079616 | consumed tokens: 8355053568 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691442E+00 | loss scale: 4096.0 | grad norm: 7.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.937 | TFLOPs: 42.35 | +[default7]: iteration 3985/ 6200 | consumed samples: 4080640 | consumed tokens: 8357150720 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716744E+00 | loss scale: 4096.0 | grad norm: 6.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.050 | TFLOPs: 42.38 | +[default7]: iteration 3986/ 6200 | consumed samples: 4081664 | consumed tokens: 8359247872 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682335E+00 | loss scale: 4096.0 | grad norm: 5.739 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.061 | TFLOPs: 42.39 | +[default7]: iteration 3987/ 6200 | consumed samples: 4082688 | consumed tokens: 8361345024 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705572E+00 | loss scale: 4096.0 | grad norm: 4.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.033 | TFLOPs: 42.38 | +[default7]: iteration 3988/ 6200 | consumed samples: 4083712 | consumed tokens: 8363442176 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710633E+00 | loss scale: 4096.0 | grad norm: 5.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.094 | TFLOPs: 42.40 | +[default7]: iteration 3989/ 6200 | consumed samples: 4084736 | consumed tokens: 8365539328 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699121E+00 | loss scale: 4096.0 | grad norm: 5.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.056 | TFLOPs: 42.39 | +[default7]: iteration 3990/ 6200 | consumed samples: 4085760 | consumed tokens: 8367636480 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662593E+00 | loss scale: 4096.0 | grad norm: 5.780 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.229 | TFLOPs: 42.44 | +[default7]: iteration 3991/ 6200 | consumed samples: 4086784 | consumed tokens: 8369733632 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689797E+00 | loss scale: 4096.0 | grad norm: 6.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.167 | TFLOPs: 42.42 | +[default7]: iteration 3992/ 6200 | consumed samples: 4087808 | consumed tokens: 8371830784 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681928E+00 | loss scale: 4096.0 | grad norm: 5.950 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.057 | TFLOPs: 42.39 | +[default7]: iteration 3993/ 6200 | consumed samples: 4088832 | consumed tokens: 8373927936 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705525E+00 | loss scale: 4096.0 | grad norm: 6.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.014 | TFLOPs: 42.37 | +[default7]: iteration 3994/ 6200 | consumed samples: 4089856 | consumed tokens: 8376025088 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694376E+00 | loss scale: 4096.0 | grad norm: 4.992 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.113 | TFLOPs: 42.40 | +[default7]: iteration 3995/ 6200 | consumed samples: 4090880 | consumed tokens: 8378122240 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715203E+00 | loss scale: 4096.0 | grad norm: 5.841 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.359 | TFLOPs: 42.48 | +[default7]: iteration 3996/ 6200 | consumed samples: 4091904 | consumed tokens: 8380219392 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693607E+00 | loss scale: 4096.0 | grad norm: 6.077 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.314 | TFLOPs: 42.46 | +[default7]: iteration 3997/ 6200 | consumed samples: 4092928 | consumed tokens: 8382316544 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705928E+00 | loss scale: 4096.0 | grad norm: 5.879 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.970 | TFLOPs: 42.36 | +[default7]: iteration 3998/ 6200 | consumed samples: 4093952 | consumed tokens: 8384413696 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692283E+00 | loss scale: 4096.0 | grad norm: 5.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.232 | TFLOPs: 42.44 | +[default7]: iteration 3999/ 6200 | consumed samples: 4094976 | consumed tokens: 8386510848 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714691E+00 | loss scale: 4096.0 | grad norm: 5.912 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.718 | TFLOPs: 42.28 | +[default7]: iteration 4000/ 6200 | consumed samples: 4096000 | consumed tokens: 8388608000 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690022E+00 | loss scale: 4096.0 | grad norm: 6.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.669 | TFLOPs: 42.27 | +[default0]:[2022-10-06 21:19:11,382] [INFO] [logging.py:68:log_dist] [Rank 0] step=4000, skipped=10, lr=[2e-05, 2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default0]:steps: 4000 loss: 1.6900 iter time (s): 7.372 samples/sec: 138.897 +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 4000 | lm loss value: 3.593558E+00 | lm loss PPL: 3.636322E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 4000 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 21:19:55,631] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4000 is begin to save! +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 4000 | lm loss value: 1.596153E+00 | lm loss PPL: 4.934012E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:[2022-10-06 21:19:55,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,045] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,076] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,105] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,136] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,166] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,196] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,226] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,255] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,283] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,312] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,344] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,371] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,400] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,429] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,457] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,486] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,513] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,541] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,570] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,602] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,630] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,659] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,716] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 21:19:56,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,743] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 21:19:56,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 21:19:56,745] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/mp_rank_00_model_states.pt +[default0]:[2022-10-06 21:19:56,745] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 21:19:56,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 21:19:56,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 21:19:56,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 21:19:56,965] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 21:19:56,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 21:19:56,958] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 21:19:56,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 21:19:56,966] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 21:19:56,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 21:19:56,943] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 21:19:56,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 21:19:56,956] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 21:19:57,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 21:19:57,045] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 21:19:57,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 21:19:57,041] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 21:19:57,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 21:19:57,051] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 21:19:57,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 21:19:57,069] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 21:19:57,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 21:19:57,084] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 21:19:57,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 21:19:57,071] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 21:19:57,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 21:19:57,022] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 21:19:57,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 21:19:57,090] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 21:19:57,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 21:19:57,066] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 21:19:57,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 21:19:57,101] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 21:19:57,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 21:19:57,068] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 21:19:57,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 21:19:57,026] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 21:19:57,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 21:19:57,061] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 21:19:57,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 21:19:57,059] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 21:19:57,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 21:19:57,067] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 21:19:57,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 21:19:57,116] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 21:19:57,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 21:19:57,087] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 21:19:57,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 21:19:57,050] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 21:19:57,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 21:19:57,071] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 21:19:57,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 21:19:57,140] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 21:19:57,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 21:19:57,076] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 21:19:57,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 21:19:57,087] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 21:19:57,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 21:19:57,082] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 21:19:57,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 21:19:57,215] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default5]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default2]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default7]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default3]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default1]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default4]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default0]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default0]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default4]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default5]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default7]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default6]:[2022-10-06 21:19:57,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 21:19:57,196] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default5]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default6]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default4]:[2022-10-06 21:19:57,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 21:19:57,224] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default0]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default1]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default2]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default3]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default6]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default5]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 21:19:57,237] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4000/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default3]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default3]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default4]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default1]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default1]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default2]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default7]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default6]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default7]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default7]:time (ms) | save-checkpoint: 1607.63 +[default2]:[2022-10-06 21:19:57,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +[default0]: successfully saved checkpoint at iteration 4000 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default7]: iteration 4001/ 6200 | consumed samples: 4097024 | consumed tokens: 8390705152 | elapsed time per iteration (s): 53.24 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695011E+00 | loss scale: 4096.0 | grad norm: 5.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.232 | TFLOPs: 5.86 | +[default7]: iteration 4002/ 6200 | consumed samples: 4098048 | consumed tokens: 8392802304 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.701049E+00 | loss scale: 4096.0 | grad norm: 5.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.567 | TFLOPs: 42.24 | +[default7]: iteration 4003/ 6200 | consumed samples: 4099072 | consumed tokens: 8394899456 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715336E+00 | loss scale: 4096.0 | grad norm: 5.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.801 | TFLOPs: 42.31 | +[default7]: iteration 4004/ 6200 | consumed samples: 4100096 | consumed tokens: 8396996608 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700933E+00 | loss scale: 4096.0 | grad norm: 6.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.977 | TFLOPs: 42.36 | +[default7]: iteration 4005/ 6200 | consumed samples: 4101120 | consumed tokens: 8399093760 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.730709E+00 | loss scale: 4096.0 | grad norm: 5.594 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.071 | TFLOPs: 42.39 | +[default7]: iteration 4006/ 6200 | consumed samples: 4102144 | consumed tokens: 8401190912 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715546E+00 | loss scale: 4096.0 | grad norm: 5.864 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.124 | TFLOPs: 42.41 | +[default7]: iteration 4007/ 6200 | consumed samples: 4103168 | consumed tokens: 8403288064 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675663E+00 | loss scale: 4096.0 | grad norm: 6.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.205 | TFLOPs: 42.43 | +[default7]: iteration 4008/ 6200 | consumed samples: 4104192 | consumed tokens: 8405385216 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709823E+00 | loss scale: 4096.0 | grad norm: 6.061 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.641 | TFLOPs: 42.26 | +[default7]: iteration 4009/ 6200 | consumed samples: 4105216 | consumed tokens: 8407482368 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703955E+00 | loss scale: 4096.0 | grad norm: 5.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.364 | TFLOPs: 42.17 | +[default7]: iteration 4010/ 6200 | consumed samples: 4106240 | consumed tokens: 8409579520 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711663E+00 | loss scale: 4096.0 | grad norm: 7.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.715 | TFLOPs: 42.28 | +[default7]: iteration 4011/ 6200 | consumed samples: 4107264 | consumed tokens: 8411676672 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707793E+00 | loss scale: 4096.0 | grad norm: 6.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.739 | TFLOPs: 42.29 | +[default7]: iteration 4012/ 6200 | consumed samples: 4108288 | consumed tokens: 8413773824 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693997E+00 | loss scale: 4096.0 | grad norm: 5.830 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.573 | TFLOPs: 42.24 | +[default7]: iteration 4013/ 6200 | consumed samples: 4109312 | consumed tokens: 8415870976 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704755E+00 | loss scale: 4096.0 | grad norm: 6.761 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.580 | TFLOPs: 42.24 | +[default7]: iteration 4014/ 6200 | consumed samples: 4110336 | consumed tokens: 8417968128 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710014E+00 | loss scale: 4096.0 | grad norm: 6.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.172 | TFLOPs: 42.42 | +[default7]: iteration 4015/ 6200 | consumed samples: 4111360 | consumed tokens: 8420065280 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.701472E+00 | loss scale: 4096.0 | grad norm: 5.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.921 | TFLOPs: 42.34 | +[default7]: iteration 4016/ 6200 | consumed samples: 4112384 | consumed tokens: 8422162432 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704192E+00 | loss scale: 4096.0 | grad norm: 4.745 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.335 | TFLOPs: 42.47 | +[default7]: iteration 4017/ 6200 | consumed samples: 4113408 | consumed tokens: 8424259584 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702753E+00 | loss scale: 4096.0 | grad norm: 5.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.208 | TFLOPs: 42.43 | +[default7]: iteration 4018/ 6200 | consumed samples: 4114432 | consumed tokens: 8426356736 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692008E+00 | loss scale: 4096.0 | grad norm: 4.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.997 | TFLOPs: 42.37 | +[default7]: iteration 4019/ 6200 | consumed samples: 4115456 | consumed tokens: 8428453888 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678884E+00 | loss scale: 4096.0 | grad norm: 7.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.967 | TFLOPs: 42.36 | +[default7]: iteration 4020/ 6200 | consumed samples: 4116480 | consumed tokens: 8430551040 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703050E+00 | loss scale: 4096.0 | grad norm: 4.911 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.199 | TFLOPs: 42.43 | +[default7]: iteration 4021/ 6200 | consumed samples: 4117504 | consumed tokens: 8432648192 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685881E+00 | loss scale: 4096.0 | grad norm: 5.571 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.996 | TFLOPs: 42.37 | +[default7]: iteration 4022/ 6200 | consumed samples: 4118528 | consumed tokens: 8434745344 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705652E+00 | loss scale: 4096.0 | grad norm: 4.947 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.210 | TFLOPs: 42.43 | +[default7]: iteration 4023/ 6200 | consumed samples: 4119552 | consumed tokens: 8436842496 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691102E+00 | loss scale: 4096.0 | grad norm: 5.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.159 | TFLOPs: 42.42 | +[default7]: iteration 4024/ 6200 | consumed samples: 4120576 | consumed tokens: 8438939648 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665214E+00 | loss scale: 4096.0 | grad norm: 5.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.321 | TFLOPs: 42.47 | +[default7]: iteration 4025/ 6200 | consumed samples: 4121600 | consumed tokens: 8441036800 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679754E+00 | loss scale: 4096.0 | grad norm: 5.882 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.153 | TFLOPs: 42.41 | +[default7]: iteration 4026/ 6200 | consumed samples: 4122624 | consumed tokens: 8443133952 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702534E+00 | loss scale: 4096.0 | grad norm: 4.660 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.013 | TFLOPs: 42.37 | +[default7]: iteration 4027/ 6200 | consumed samples: 4123648 | consumed tokens: 8445231104 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.719972E+00 | loss scale: 4096.0 | grad norm: 5.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.960 | TFLOPs: 42.36 | +[default7]: iteration 4028/ 6200 | consumed samples: 4124672 | consumed tokens: 8447328256 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678553E+00 | loss scale: 4096.0 | grad norm: 6.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.091 | TFLOPs: 42.40 | +[default7]: iteration 4029/ 6200 | consumed samples: 4125696 | consumed tokens: 8449425408 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680534E+00 | loss scale: 4096.0 | grad norm: 5.980 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.059 | TFLOPs: 42.39 | +[default7]: iteration 4030/ 6200 | consumed samples: 4126720 | consumed tokens: 8451522560 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714358E+00 | loss scale: 4096.0 | grad norm: 5.613 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.140 | TFLOPs: 42.41 | +[default7]: iteration 4031/ 6200 | consumed samples: 4127744 | consumed tokens: 8453619712 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703385E+00 | loss scale: 4096.0 | grad norm: 5.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.058 | TFLOPs: 42.39 | +[default7]: iteration 4032/ 6200 | consumed samples: 4128768 | consumed tokens: 8455716864 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686731E+00 | loss scale: 4096.0 | grad norm: 6.960 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.805 | TFLOPs: 42.31 | +[default7]: iteration 4033/ 6200 | consumed samples: 4129792 | consumed tokens: 8457814016 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705388E+00 | loss scale: 4096.0 | grad norm: 5.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.122 | TFLOPs: 42.41 | +[default7]: iteration 4034/ 6200 | consumed samples: 4130816 | consumed tokens: 8459911168 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685410E+00 | loss scale: 4096.0 | grad norm: 4.792 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.128 | TFLOPs: 42.41 | +[default7]: iteration 4035/ 6200 | consumed samples: 4131840 | consumed tokens: 8462008320 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.730334E+00 | loss scale: 4096.0 | grad norm: 5.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.800 | TFLOPs: 42.31 | +[default7]: iteration 4036/ 6200 | consumed samples: 4132864 | consumed tokens: 8464105472 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698843E+00 | loss scale: 4096.0 | grad norm: 5.734 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.247 | TFLOPs: 42.44 | +[default7]: iteration 4037/ 6200 | consumed samples: 4133888 | consumed tokens: 8466202624 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700488E+00 | loss scale: 4096.0 | grad norm: 6.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.006 | TFLOPs: 42.37 | +[default7]: iteration 4038/ 6200 | consumed samples: 4134912 | consumed tokens: 8468299776 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708323E+00 | loss scale: 4096.0 | grad norm: 4.911 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.658 | TFLOPs: 42.26 | +[default7]: iteration 4039/ 6200 | consumed samples: 4135936 | consumed tokens: 8470396928 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711785E+00 | loss scale: 4096.0 | grad norm: 4.840 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.840 | TFLOPs: 42.32 | +[default7]: iteration 4040/ 6200 | consumed samples: 4136960 | consumed tokens: 8472494080 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698064E+00 | loss scale: 4096.0 | grad norm: 4.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.913 | TFLOPs: 42.34 | +[default7]: iteration 4041/ 6200 | consumed samples: 4137984 | consumed tokens: 8474591232 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685786E+00 | loss scale: 4096.0 | grad norm: 6.651 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.987 | TFLOPs: 42.36 | +[default7]: iteration 4042/ 6200 | consumed samples: 4139008 | consumed tokens: 8476688384 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681535E+00 | loss scale: 4096.0 | grad norm: 5.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.201 | TFLOPs: 42.43 | +[default7]: iteration 4043/ 6200 | consumed samples: 4140032 | consumed tokens: 8478785536 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695192E+00 | loss scale: 4096.0 | grad norm: 5.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.244 | TFLOPs: 42.44 | +[default7]: iteration 4044/ 6200 | consumed samples: 4141056 | consumed tokens: 8480882688 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707617E+00 | loss scale: 4096.0 | grad norm: 5.776 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.496 | TFLOPs: 42.52 | +[default7]: iteration 4045/ 6200 | consumed samples: 4142080 | consumed tokens: 8482979840 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702762E+00 | loss scale: 4096.0 | grad norm: 5.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.122 | TFLOPs: 42.41 | +[default7]: iteration 4046/ 6200 | consumed samples: 4143104 | consumed tokens: 8485076992 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702592E+00 | loss scale: 4096.0 | grad norm: 5.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.815 | TFLOPs: 42.31 | +[default7]: iteration 4047/ 6200 | consumed samples: 4144128 | consumed tokens: 8487174144 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.701967E+00 | loss scale: 4096.0 | grad norm: 5.768 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.930 | TFLOPs: 42.35 | +[default7]: iteration 4048/ 6200 | consumed samples: 4145152 | consumed tokens: 8489271296 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.722991E+00 | loss scale: 4096.0 | grad norm: 5.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.111 | TFLOPs: 42.40 | +[default7]: iteration 4049/ 6200 | consumed samples: 4146176 | consumed tokens: 8491368448 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688531E+00 | loss scale: 4096.0 | grad norm: 4.869 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.998 | TFLOPs: 42.37 | +[default7]: iteration 4050/ 6200 | consumed samples: 4147200 | consumed tokens: 8493465600 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696470E+00 | loss scale: 4096.0 | grad norm: 5.842 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.087 | TFLOPs: 42.39 | +[default7]: iteration 4051/ 6200 | consumed samples: 4148224 | consumed tokens: 8495562752 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712663E+00 | loss scale: 4096.0 | grad norm: 5.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.278 | TFLOPs: 42.45 | +[default7]: iteration 4052/ 6200 | consumed samples: 4149248 | consumed tokens: 8497659904 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680116E+00 | loss scale: 4096.0 | grad norm: 5.880 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.975 | TFLOPs: 42.36 | +[default7]: iteration 4053/ 6200 | consumed samples: 4150272 | consumed tokens: 8499757056 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706388E+00 | loss scale: 4096.0 | grad norm: 5.916 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.040 | TFLOPs: 42.38 | +[default7]: iteration 4054/ 6200 | consumed samples: 4151296 | consumed tokens: 8501854208 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709595E+00 | loss scale: 4096.0 | grad norm: 5.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.212 | TFLOPs: 42.43 | +[default7]: iteration 4055/ 6200 | consumed samples: 4152320 | consumed tokens: 8503951360 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711353E+00 | loss scale: 4096.0 | grad norm: 6.829 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.905 | TFLOPs: 42.34 | +[default7]: iteration 4056/ 6200 | consumed samples: 4153344 | consumed tokens: 8506048512 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692833E+00 | loss scale: 4096.0 | grad norm: 5.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.751 | TFLOPs: 42.29 | +[default7]: iteration 4057/ 6200 | consumed samples: 4154368 | consumed tokens: 8508145664 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702467E+00 | loss scale: 4096.0 | grad norm: 5.553 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.228 | TFLOPs: 42.44 | +[default7]: iteration 4058/ 6200 | consumed samples: 4155392 | consumed tokens: 8510242816 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687917E+00 | loss scale: 4096.0 | grad norm: 5.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.810 | TFLOPs: 42.31 | +[default7]: iteration 4059/ 6200 | consumed samples: 4156416 | consumed tokens: 8512339968 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704434E+00 | loss scale: 4096.0 | grad norm: 5.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.853 | TFLOPs: 42.32 | +[default7]: iteration 4060/ 6200 | consumed samples: 4157440 | consumed tokens: 8514437120 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714131E+00 | loss scale: 4096.0 | grad norm: 4.668 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.331 | TFLOPs: 42.47 | +[default7]: iteration 4061/ 6200 | consumed samples: 4158464 | consumed tokens: 8516534272 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707743E+00 | loss scale: 4096.0 | grad norm: 6.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.771 | TFLOPs: 42.30 | +[default7]: iteration 4062/ 6200 | consumed samples: 4159488 | consumed tokens: 8518631424 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678300E+00 | loss scale: 4096.0 | grad norm: 5.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.907 | TFLOPs: 42.34 | +[default7]: iteration 4063/ 6200 | consumed samples: 4160512 | consumed tokens: 8520728576 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689679E+00 | loss scale: 4096.0 | grad norm: 5.604 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.006 | TFLOPs: 42.37 | +[default7]: iteration 4064/ 6200 | consumed samples: 4161536 | consumed tokens: 8522825728 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680867E+00 | loss scale: 4096.0 | grad norm: 6.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.048 | TFLOPs: 42.38 | +[default7]: iteration 4065/ 6200 | consumed samples: 4162560 | consumed tokens: 8524922880 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703983E+00 | loss scale: 4096.0 | grad norm: 5.781 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.039 | TFLOPs: 42.38 | +[default7]: iteration 4066/ 6200 | consumed samples: 4163584 | consumed tokens: 8527020032 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683380E+00 | loss scale: 4096.0 | grad norm: 5.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.883 | TFLOPs: 42.33 | +[default7]: iteration 4067/ 6200 | consumed samples: 4164608 | consumed tokens: 8529117184 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682936E+00 | loss scale: 4096.0 | grad norm: 5.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.954 | TFLOPs: 42.35 | +[default7]: iteration 4068/ 6200 | consumed samples: 4165632 | consumed tokens: 8531214336 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.724413E+00 | loss scale: 4096.0 | grad norm: 5.931 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.939 | TFLOPs: 42.35 | +[default7]: iteration 4069/ 6200 | consumed samples: 4166656 | consumed tokens: 8533311488 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696474E+00 | loss scale: 4096.0 | grad norm: 5.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.217 | TFLOPs: 42.43 | +[default7]: iteration 4070/ 6200 | consumed samples: 4167680 | consumed tokens: 8535408640 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675079E+00 | loss scale: 4096.0 | grad norm: 7.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.042 | TFLOPs: 42.38 | +[default7]: iteration 4071/ 6200 | consumed samples: 4168704 | consumed tokens: 8537505792 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706322E+00 | loss scale: 4096.0 | grad norm: 5.055 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.908 | TFLOPs: 42.34 | +[default7]: iteration 4072/ 6200 | consumed samples: 4169728 | consumed tokens: 8539602944 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685627E+00 | loss scale: 4096.0 | grad norm: 5.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.056 | TFLOPs: 42.39 | +[default7]: iteration 4073/ 6200 | consumed samples: 4170752 | consumed tokens: 8541700096 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704465E+00 | loss scale: 4096.0 | grad norm: 5.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.969 | TFLOPs: 42.36 | +[default7]: iteration 4074/ 6200 | consumed samples: 4171776 | consumed tokens: 8543797248 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707034E+00 | loss scale: 4096.0 | grad norm: 5.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.568 | TFLOPs: 42.24 | +[default7]: iteration 4075/ 6200 | consumed samples: 4172800 | consumed tokens: 8545894400 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708042E+00 | loss scale: 4096.0 | grad norm: 5.029 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.805 | TFLOPs: 42.31 | +[default7]: iteration 4076/ 6200 | consumed samples: 4173824 | consumed tokens: 8547991552 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695222E+00 | loss scale: 4096.0 | grad norm: 5.801 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.390 | TFLOPs: 42.49 | +[default7]: iteration 4077/ 6200 | consumed samples: 4174848 | consumed tokens: 8550088704 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695062E+00 | loss scale: 4096.0 | grad norm: 5.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.284 | TFLOPs: 42.45 | +[default7]: iteration 4078/ 6200 | consumed samples: 4175872 | consumed tokens: 8552185856 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689417E+00 | loss scale: 4096.0 | grad norm: 5.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.128 | TFLOPs: 42.41 | +[default7]: iteration 4079/ 6200 | consumed samples: 4176896 | consumed tokens: 8554283008 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650236E+00 | loss scale: 4096.0 | grad norm: 4.858 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.244 | TFLOPs: 42.44 | +[default7]: iteration 4080/ 6200 | consumed samples: 4177920 | consumed tokens: 8556380160 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688374E+00 | loss scale: 4096.0 | grad norm: 6.037 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.735 | TFLOPs: 42.29 | +[default7]: iteration 4081/ 6200 | consumed samples: 4178944 | consumed tokens: 8558477312 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717727E+00 | loss scale: 4096.0 | grad norm: 6.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.801 | TFLOPs: 42.31 | +[default7]: iteration 4082/ 6200 | consumed samples: 4179968 | consumed tokens: 8560574464 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698888E+00 | loss scale: 4096.0 | grad norm: 5.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.646 | TFLOPs: 42.26 | +[default7]: iteration 4083/ 6200 | consumed samples: 4180992 | consumed tokens: 8562671616 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.727278E+00 | loss scale: 4096.0 | grad norm: 5.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.584 | TFLOPs: 42.24 | +[default7]: iteration 4084/ 6200 | consumed samples: 4182016 | consumed tokens: 8564768768 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687560E+00 | loss scale: 4096.0 | grad norm: 5.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.831 | TFLOPs: 42.32 | +[default7]: iteration 4085/ 6200 | consumed samples: 4183040 | consumed tokens: 8566865920 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704387E+00 | loss scale: 4096.0 | grad norm: 5.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.993 | TFLOPs: 42.37 | +[default7]: iteration 4086/ 6200 | consumed samples: 4184064 | consumed tokens: 8568963072 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685208E+00 | loss scale: 4096.0 | grad norm: 5.644 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.713 | TFLOPs: 42.28 | +[default7]: iteration 4087/ 6200 | consumed samples: 4185088 | consumed tokens: 8571060224 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714897E+00 | loss scale: 4096.0 | grad norm: 5.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.865 | TFLOPs: 42.33 | +[default7]: iteration 4088/ 6200 | consumed samples: 4186112 | consumed tokens: 8573157376 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698263E+00 | loss scale: 4096.0 | grad norm: 5.917 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.835 | TFLOPs: 42.32 | +[default7]: iteration 4089/ 6200 | consumed samples: 4187136 | consumed tokens: 8575254528 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680889E+00 | loss scale: 4096.0 | grad norm: 5.873 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.769 | TFLOPs: 42.30 | +[default7]: iteration 4090/ 6200 | consumed samples: 4188160 | consumed tokens: 8577351680 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695126E+00 | loss scale: 4096.0 | grad norm: 6.013 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.012 | TFLOPs: 42.37 | +[default7]: iteration 4091/ 6200 | consumed samples: 4189184 | consumed tokens: 8579448832 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695305E+00 | loss scale: 4096.0 | grad norm: 6.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.755 | TFLOPs: 42.29 | +[default7]: iteration 4092/ 6200 | consumed samples: 4190208 | consumed tokens: 8581545984 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712757E+00 | loss scale: 4096.0 | grad norm: 4.859 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.853 | TFLOPs: 42.32 | +[default7]: iteration 4093/ 6200 | consumed samples: 4191232 | consumed tokens: 8583643136 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712820E+00 | loss scale: 4096.0 | grad norm: 6.072 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.629 | TFLOPs: 42.26 | +[default7]: iteration 4094/ 6200 | consumed samples: 4192256 | consumed tokens: 8585740288 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686809E+00 | loss scale: 4096.0 | grad norm: 7.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.741 | TFLOPs: 42.29 | +[default7]: iteration 4095/ 6200 | consumed samples: 4193280 | consumed tokens: 8587837440 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668917E+00 | loss scale: 4096.0 | grad norm: 7.942 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.582 | TFLOPs: 42.24 | +[default7]: iteration 4096/ 6200 | consumed samples: 4194304 | consumed tokens: 8589934592 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681205E+00 | loss scale: 4096.0 | grad norm: 6.954 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.971 | TFLOPs: 42.36 | +[default7]: iteration 4097/ 6200 | consumed samples: 4195328 | consumed tokens: 8592031744 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693584E+00 | loss scale: 4096.0 | grad norm: 7.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.416 | TFLOPs: 42.19 | +[default7]: iteration 4098/ 6200 | consumed samples: 4196352 | consumed tokens: 8594128896 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690562E+00 | loss scale: 4096.0 | grad norm: 5.646 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.578 | TFLOPs: 42.24 | +[default7]: iteration 4099/ 6200 | consumed samples: 4197376 | consumed tokens: 8596226048 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705906E+00 | loss scale: 4096.0 | grad norm: 5.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.522 | TFLOPs: 42.22 | +[default7]: iteration 4100/ 6200 | consumed samples: 4198400 | consumed tokens: 8598323200 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714047E+00 | loss scale: 4096.0 | grad norm: 6.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.818 | TFLOPs: 42.01 | +[default7]: iteration 4101/ 6200 | consumed samples: 4199424 | consumed tokens: 8600420352 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685333E+00 | loss scale: 4096.0 | grad norm: 6.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.824 | TFLOPs: 42.31 | +[default7]: iteration 4102/ 6200 | consumed samples: 4200448 | consumed tokens: 8602517504 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699389E+00 | loss scale: 4096.0 | grad norm: 4.816 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.794 | TFLOPs: 42.31 | +[default7]: iteration 4103/ 6200 | consumed samples: 4201472 | consumed tokens: 8604614656 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681226E+00 | loss scale: 4096.0 | grad norm: 5.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.733 | TFLOPs: 42.29 | +[default7]: iteration 4104/ 6200 | consumed samples: 4202496 | consumed tokens: 8606711808 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686136E+00 | loss scale: 4096.0 | grad norm: 6.041 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.067 | TFLOPs: 42.39 | +[default7]: iteration 4105/ 6200 | consumed samples: 4203520 | consumed tokens: 8608808960 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703475E+00 | loss scale: 4096.0 | grad norm: 5.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.927 | TFLOPs: 42.35 | +[default7]: iteration 4106/ 6200 | consumed samples: 4204544 | consumed tokens: 8610906112 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.701387E+00 | loss scale: 4096.0 | grad norm: 5.912 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.749 | TFLOPs: 42.29 | +[default7]: iteration 4107/ 6200 | consumed samples: 4205568 | consumed tokens: 8613003264 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688676E+00 | loss scale: 4096.0 | grad norm: 5.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.660 | TFLOPs: 42.26 | +[default7]: iteration 4108/ 6200 | consumed samples: 4206592 | consumed tokens: 8615100416 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682701E+00 | loss scale: 4096.0 | grad norm: 6.858 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.477 | TFLOPs: 42.21 | +[default7]: iteration 4109/ 6200 | consumed samples: 4207616 | consumed tokens: 8617197568 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704992E+00 | loss scale: 4096.0 | grad norm: 5.574 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.386 | TFLOPs: 42.18 | +[default7]: iteration 4110/ 6200 | consumed samples: 4208640 | consumed tokens: 8619294720 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680074E+00 | loss scale: 4096.0 | grad norm: 5.664 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.946 | TFLOPs: 42.35 | +[default7]: iteration 4111/ 6200 | consumed samples: 4209664 | consumed tokens: 8621391872 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703112E+00 | loss scale: 4096.0 | grad norm: 5.793 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.979 | TFLOPs: 42.36 | +[default7]: iteration 4112/ 6200 | consumed samples: 4210688 | consumed tokens: 8623489024 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681573E+00 | loss scale: 4096.0 | grad norm: 5.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.927 | TFLOPs: 42.35 | +[default7]: iteration 4113/ 6200 | consumed samples: 4211712 | consumed tokens: 8625586176 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699773E+00 | loss scale: 4096.0 | grad norm: 5.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.804 | TFLOPs: 42.31 | +[default7]: iteration 4114/ 6200 | consumed samples: 4212736 | consumed tokens: 8627683328 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695552E+00 | loss scale: 4096.0 | grad norm: 4.922 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.724 | TFLOPs: 42.28 | +[default7]: iteration 4115/ 6200 | consumed samples: 4213760 | consumed tokens: 8629780480 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698088E+00 | loss scale: 4096.0 | grad norm: 5.747 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.914 | TFLOPs: 42.34 | +[default7]: iteration 4116/ 6200 | consumed samples: 4214784 | consumed tokens: 8631877632 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687386E+00 | loss scale: 4096.0 | grad norm: 5.946 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.059 | TFLOPs: 42.39 | +[default7]: iteration 4117/ 6200 | consumed samples: 4215808 | consumed tokens: 8633974784 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663714E+00 | loss scale: 4096.0 | grad norm: 5.747 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.008 | TFLOPs: 42.37 | +[default7]: iteration 4118/ 6200 | consumed samples: 4216832 | consumed tokens: 8636071936 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680403E+00 | loss scale: 4096.0 | grad norm: 5.888 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.145 | TFLOPs: 42.41 | +[default7]: iteration 4119/ 6200 | consumed samples: 4217856 | consumed tokens: 8638169088 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682484E+00 | loss scale: 4096.0 | grad norm: 6.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.606 | TFLOPs: 42.25 | +[default7]: iteration 4120/ 6200 | consumed samples: 4218880 | consumed tokens: 8640266240 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705716E+00 | loss scale: 4096.0 | grad norm: 5.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.595 | TFLOPs: 42.24 | +[default7]: iteration 4121/ 6200 | consumed samples: 4219904 | consumed tokens: 8642363392 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698118E+00 | loss scale: 4096.0 | grad norm: 5.779 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.652 | TFLOPs: 42.26 | +[default7]: iteration 4122/ 6200 | consumed samples: 4220928 | consumed tokens: 8644460544 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.713821E+00 | loss scale: 4096.0 | grad norm: 5.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.823 | TFLOPs: 42.31 | +[default7]: iteration 4123/ 6200 | consumed samples: 4221952 | consumed tokens: 8646557696 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682091E+00 | loss scale: 4096.0 | grad norm: 5.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.316 | TFLOPs: 42.16 | +[default7]: iteration 4124/ 6200 | consumed samples: 4222976 | consumed tokens: 8648654848 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688372E+00 | loss scale: 4096.0 | grad norm: 5.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.523 | TFLOPs: 42.22 | +[default7]: iteration 4125/ 6200 | consumed samples: 4224000 | consumed tokens: 8650752000 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708990E+00 | loss scale: 4096.0 | grad norm: 5.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.442 | TFLOPs: 42.20 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 4125 | lm loss value: 3.610909E+00 | lm loss PPL: 3.699968E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 4125 | lm loss value: 1.584054E+00 | lm loss PPL: 4.874677E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 4126/ 6200 | consumed samples: 4225024 | consumed tokens: 8652849152 | elapsed time per iteration (s): 52.01 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698779E+00 | loss scale: 4096.0 | grad norm: 5.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.688 | TFLOPs: 6.00 | +[default7]: iteration 4127/ 6200 | consumed samples: 4226048 | consumed tokens: 8654946304 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688801E+00 | loss scale: 4096.0 | grad norm: 5.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.168 | TFLOPs: 42.11 | +[default7]: iteration 4128/ 6200 | consumed samples: 4227072 | consumed tokens: 8657043456 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690383E+00 | loss scale: 4096.0 | grad norm: 5.913 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.493 | TFLOPs: 42.21 | +[default7]: iteration 4129/ 6200 | consumed samples: 4228096 | consumed tokens: 8659140608 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705452E+00 | loss scale: 4096.0 | grad norm: 6.758 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.766 | TFLOPs: 42.30 | +[default7]: iteration 4130/ 6200 | consumed samples: 4229120 | consumed tokens: 8661237760 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706177E+00 | loss scale: 4096.0 | grad norm: 5.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.095 | TFLOPs: 42.40 | +[default7]: iteration 4131/ 6200 | consumed samples: 4230144 | consumed tokens: 8663334912 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706438E+00 | loss scale: 4096.0 | grad norm: 6.003 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.701 | TFLOPs: 42.28 | +[default7]: iteration 4132/ 6200 | consumed samples: 4231168 | consumed tokens: 8665432064 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674756E+00 | loss scale: 4096.0 | grad norm: 6.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.179 | TFLOPs: 42.42 | +[default7]: iteration 4133/ 6200 | consumed samples: 4232192 | consumed tokens: 8667529216 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690593E+00 | loss scale: 4096.0 | grad norm: 7.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.675 | TFLOPs: 42.27 | +[default7]: iteration 4134/ 6200 | consumed samples: 4233216 | consumed tokens: 8669626368 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675550E+00 | loss scale: 4096.0 | grad norm: 5.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.697 | TFLOPs: 42.28 | +[default7]: iteration 4135/ 6200 | consumed samples: 4234240 | consumed tokens: 8671723520 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697971E+00 | loss scale: 4096.0 | grad norm: 5.817 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.681 | TFLOPs: 42.27 | +[default7]: iteration 4136/ 6200 | consumed samples: 4235264 | consumed tokens: 8673820672 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662279E+00 | loss scale: 4096.0 | grad norm: 6.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.698 | TFLOPs: 42.28 | +[default7]: iteration 4137/ 6200 | consumed samples: 4236288 | consumed tokens: 8675917824 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672168E+00 | loss scale: 4096.0 | grad norm: 6.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.999 | TFLOPs: 42.37 | +[default7]: iteration 4138/ 6200 | consumed samples: 4237312 | consumed tokens: 8678014976 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650203E+00 | loss scale: 4096.0 | grad norm: 5.076 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.937 | TFLOPs: 42.35 | +[default7]: iteration 4139/ 6200 | consumed samples: 4238336 | consumed tokens: 8680112128 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707514E+00 | loss scale: 4096.0 | grad norm: 6.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.054 | TFLOPs: 42.38 | +[default7]: iteration 4140/ 6200 | consumed samples: 4239360 | consumed tokens: 8682209280 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694432E+00 | loss scale: 4096.0 | grad norm: 6.615 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.516 | TFLOPs: 42.22 | +[default7]: iteration 4141/ 6200 | consumed samples: 4240384 | consumed tokens: 8684306432 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689209E+00 | loss scale: 4096.0 | grad norm: 5.009 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.287 | TFLOPs: 42.15 | +[default7]: iteration 4142/ 6200 | consumed samples: 4241408 | consumed tokens: 8686403584 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695280E+00 | loss scale: 4096.0 | grad norm: 5.721 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.643 | TFLOPs: 42.26 | +[default7]: iteration 4143/ 6200 | consumed samples: 4242432 | consumed tokens: 8688500736 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675748E+00 | loss scale: 4096.0 | grad norm: 5.879 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.625 | TFLOPs: 42.25 | +[default7]: iteration 4144/ 6200 | consumed samples: 4243456 | consumed tokens: 8690597888 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694854E+00 | loss scale: 4096.0 | grad norm: 6.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.722 | TFLOPs: 42.28 | +[default7]: iteration 4145/ 6200 | consumed samples: 4244480 | consumed tokens: 8692695040 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698095E+00 | loss scale: 4096.0 | grad norm: 5.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.582 | TFLOPs: 42.24 | +[default7]: iteration 4146/ 6200 | consumed samples: 4245504 | consumed tokens: 8694792192 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692704E+00 | loss scale: 4096.0 | grad norm: 5.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.593 | TFLOPs: 42.24 | +[default7]: iteration 4147/ 6200 | consumed samples: 4246528 | consumed tokens: 8696889344 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675253E+00 | loss scale: 4096.0 | grad norm: 4.808 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.641 | TFLOPs: 42.26 | +[default7]: iteration 4148/ 6200 | consumed samples: 4247552 | consumed tokens: 8698986496 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658046E+00 | loss scale: 4096.0 | grad norm: 5.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.479 | TFLOPs: 42.21 | +[default7]: iteration 4149/ 6200 | consumed samples: 4248576 | consumed tokens: 8701083648 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653647E+00 | loss scale: 4096.0 | grad norm: 5.565 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.600 | TFLOPs: 42.25 | +[default7]: iteration 4150/ 6200 | consumed samples: 4249600 | consumed tokens: 8703180800 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680193E+00 | loss scale: 4096.0 | grad norm: 4.961 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.645 | TFLOPs: 42.26 | +[default7]: iteration 4151/ 6200 | consumed samples: 4250624 | consumed tokens: 8705277952 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680228E+00 | loss scale: 4096.0 | grad norm: 5.876 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.387 | TFLOPs: 42.18 | +[default7]: iteration 4152/ 6200 | consumed samples: 4251648 | consumed tokens: 8707375104 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668338E+00 | loss scale: 4096.0 | grad norm: 6.668 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.583 | TFLOPs: 42.24 | +[default7]: iteration 4153/ 6200 | consumed samples: 4252672 | consumed tokens: 8709472256 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698354E+00 | loss scale: 4096.0 | grad norm: 5.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.430 | TFLOPs: 42.19 | +[default7]: iteration 4154/ 6200 | consumed samples: 4253696 | consumed tokens: 8711569408 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674368E+00 | loss scale: 4096.0 | grad norm: 4.975 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.366 | TFLOPs: 42.18 | +[default7]: iteration 4155/ 6200 | consumed samples: 4254720 | consumed tokens: 8713666560 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693372E+00 | loss scale: 4096.0 | grad norm: 5.993 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.711 | TFLOPs: 42.28 | +[default7]: iteration 4156/ 6200 | consumed samples: 4255744 | consumed tokens: 8715763712 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.701366E+00 | loss scale: 4096.0 | grad norm: 5.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.741 | TFLOPs: 42.29 | +[default7]: iteration 4157/ 6200 | consumed samples: 4256768 | consumed tokens: 8717860864 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694698E+00 | loss scale: 4096.0 | grad norm: 6.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.651 | TFLOPs: 42.26 | +[default7]: iteration 4158/ 6200 | consumed samples: 4257792 | consumed tokens: 8719958016 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698865E+00 | loss scale: 4096.0 | grad norm: 5.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.572 | TFLOPs: 42.24 | +[default7]: iteration 4159/ 6200 | consumed samples: 4258816 | consumed tokens: 8722055168 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690796E+00 | loss scale: 4096.0 | grad norm: 5.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.471 | TFLOPs: 42.21 | +[default7]: iteration 4160/ 6200 | consumed samples: 4259840 | consumed tokens: 8724152320 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694864E+00 | loss scale: 4096.0 | grad norm: 5.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.564 | TFLOPs: 42.24 | +[default7]: iteration 4161/ 6200 | consumed samples: 4260864 | consumed tokens: 8726249472 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697058E+00 | loss scale: 4096.0 | grad norm: 5.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.694 | TFLOPs: 42.28 | +[default7]: iteration 4162/ 6200 | consumed samples: 4261888 | consumed tokens: 8728346624 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698377E+00 | loss scale: 4096.0 | grad norm: 5.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.833 | TFLOPs: 42.32 | +[default7]: iteration 4163/ 6200 | consumed samples: 4262912 | consumed tokens: 8730443776 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690586E+00 | loss scale: 4096.0 | grad norm: 5.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.754 | TFLOPs: 42.29 | +[default7]: iteration 4164/ 6200 | consumed samples: 4263936 | consumed tokens: 8732540928 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691336E+00 | loss scale: 4096.0 | grad norm: 5.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.653 | TFLOPs: 42.26 | +[default7]: iteration 4165/ 6200 | consumed samples: 4264960 | consumed tokens: 8734638080 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702758E+00 | loss scale: 4096.0 | grad norm: 6.612 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.814 | TFLOPs: 42.31 | +[default7]: iteration 4166/ 6200 | consumed samples: 4265984 | consumed tokens: 8736735232 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687151E+00 | loss scale: 4096.0 | grad norm: 6.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.062 | TFLOPs: 42.39 | +[default7]: iteration 4167/ 6200 | consumed samples: 4267008 | consumed tokens: 8738832384 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687587E+00 | loss scale: 4096.0 | grad norm: 4.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.370 | TFLOPs: 42.18 | +[default7]: iteration 4168/ 6200 | consumed samples: 4268032 | consumed tokens: 8740929536 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682304E+00 | loss scale: 4096.0 | grad norm: 6.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.211 | TFLOPs: 42.13 | +[default7]: iteration 4169/ 6200 | consumed samples: 4269056 | consumed tokens: 8743026688 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689081E+00 | loss scale: 4096.0 | grad norm: 5.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.255 | TFLOPs: 42.14 | +[default7]: iteration 4170/ 6200 | consumed samples: 4270080 | consumed tokens: 8745123840 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697425E+00 | loss scale: 4096.0 | grad norm: 6.636 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.447 | TFLOPs: 42.20 | +[default7]: iteration 4171/ 6200 | consumed samples: 4271104 | consumed tokens: 8747220992 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678530E+00 | loss scale: 4096.0 | grad norm: 4.837 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.623 | TFLOPs: 42.25 | +[default7]: iteration 4172/ 6200 | consumed samples: 4272128 | consumed tokens: 8749318144 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668601E+00 | loss scale: 4096.0 | grad norm: 5.594 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.600 | TFLOPs: 42.25 | +[default7]: iteration 4173/ 6200 | consumed samples: 4273152 | consumed tokens: 8751415296 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695603E+00 | loss scale: 4096.0 | grad norm: 5.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.435 | TFLOPs: 42.20 | +[default7]: iteration 4174/ 6200 | consumed samples: 4274176 | consumed tokens: 8753512448 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679654E+00 | loss scale: 4096.0 | grad norm: 5.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.454 | TFLOPs: 42.20 | +[default7]: iteration 4175/ 6200 | consumed samples: 4275200 | consumed tokens: 8755609600 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683548E+00 | loss scale: 4096.0 | grad norm: 5.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.560 | TFLOPs: 42.23 | +[default7]: iteration 4176/ 6200 | consumed samples: 4276224 | consumed tokens: 8757706752 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712011E+00 | loss scale: 4096.0 | grad norm: 5.055 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.462 | TFLOPs: 42.20 | +[default7]: iteration 4177/ 6200 | consumed samples: 4277248 | consumed tokens: 8759803904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692961E+00 | loss scale: 4096.0 | grad norm: 5.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.697 | TFLOPs: 42.28 | +[default7]: iteration 4178/ 6200 | consumed samples: 4278272 | consumed tokens: 8761901056 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656796E+00 | loss scale: 4096.0 | grad norm: 5.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.533 | TFLOPs: 42.23 | +[default7]: iteration 4179/ 6200 | consumed samples: 4279296 | consumed tokens: 8763998208 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688811E+00 | loss scale: 4096.0 | grad norm: 6.075 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.500 | TFLOPs: 42.22 | +[default7]: iteration 4180/ 6200 | consumed samples: 4280320 | consumed tokens: 8766095360 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686116E+00 | loss scale: 4096.0 | grad norm: 5.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.588 | TFLOPs: 42.24 | +[default7]: iteration 4181/ 6200 | consumed samples: 4281344 | consumed tokens: 8768192512 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682091E+00 | loss scale: 4096.0 | grad norm: 5.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.775 | TFLOPs: 42.30 | +[default7]: iteration 4182/ 6200 | consumed samples: 4282368 | consumed tokens: 8770289664 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680185E+00 | loss scale: 4096.0 | grad norm: 4.673 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.097 | TFLOPs: 42.40 | +[default7]: iteration 4183/ 6200 | consumed samples: 4283392 | consumed tokens: 8772386816 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699715E+00 | loss scale: 4096.0 | grad norm: 5.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.757 | TFLOPs: 42.29 | +[default7]: iteration 4184/ 6200 | consumed samples: 4284416 | consumed tokens: 8774483968 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679858E+00 | loss scale: 4096.0 | grad norm: 4.653 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.136 | TFLOPs: 42.41 | +[default7]: iteration 4185/ 6200 | consumed samples: 4285440 | consumed tokens: 8776581120 | elapsed time per iteration (s): 7.55 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681754E+00 | loss scale: 4096.0 | grad norm: 6.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 135.697 | TFLOPs: 41.36 | +[default7]: iteration 4186/ 6200 | consumed samples: 4286464 | consumed tokens: 8778678272 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695049E+00 | loss scale: 4096.0 | grad norm: 5.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.916 | TFLOPs: 42.34 | +[default7]: iteration 4187/ 6200 | consumed samples: 4287488 | consumed tokens: 8780775424 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680223E+00 | loss scale: 4096.0 | grad norm: 5.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.096 | TFLOPs: 42.40 | +[default7]: iteration 4188/ 6200 | consumed samples: 4288512 | consumed tokens: 8782872576 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681533E+00 | loss scale: 4096.0 | grad norm: 8.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.476 | TFLOPs: 42.51 | +[default7]: iteration 4189/ 6200 | consumed samples: 4289536 | consumed tokens: 8784969728 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689174E+00 | loss scale: 4096.0 | grad norm: 6.731 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.017 | TFLOPs: 42.37 | +[default7]: iteration 4190/ 6200 | consumed samples: 4290560 | consumed tokens: 8787066880 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712323E+00 | loss scale: 4096.0 | grad norm: 5.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.601 | TFLOPs: 42.25 | +[default7]: iteration 4191/ 6200 | consumed samples: 4291584 | consumed tokens: 8789164032 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679184E+00 | loss scale: 4096.0 | grad norm: 6.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.671 | TFLOPs: 42.27 | +[default7]: iteration 4192/ 6200 | consumed samples: 4292608 | consumed tokens: 8791261184 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671978E+00 | loss scale: 4096.0 | grad norm: 6.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.706 | TFLOPs: 42.28 | +[default7]: iteration 4193/ 6200 | consumed samples: 4293632 | consumed tokens: 8793358336 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656257E+00 | loss scale: 4096.0 | grad norm: 5.069 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.725 | TFLOPs: 42.28 | +[default7]: iteration 4194/ 6200 | consumed samples: 4294656 | consumed tokens: 8795455488 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704244E+00 | loss scale: 4096.0 | grad norm: 5.571 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.496 | TFLOPs: 42.21 | +[default7]: iteration 4195/ 6200 | consumed samples: 4295680 | consumed tokens: 8797552640 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699888E+00 | loss scale: 4096.0 | grad norm: 5.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.607 | TFLOPs: 42.25 | +[default7]: iteration 4196/ 6200 | consumed samples: 4296704 | consumed tokens: 8799649792 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674929E+00 | loss scale: 4096.0 | grad norm: 5.829 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.662 | TFLOPs: 42.27 | +[default7]: iteration 4197/ 6200 | consumed samples: 4297728 | consumed tokens: 8801746944 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695607E+00 | loss scale: 4096.0 | grad norm: 5.722 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.323 | TFLOPs: 42.16 | +[default7]: iteration 4198/ 6200 | consumed samples: 4298752 | consumed tokens: 8803844096 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715836E+00 | loss scale: 4096.0 | grad norm: 5.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.337 | TFLOPs: 42.17 | +[default7]: iteration 4199/ 6200 | consumed samples: 4299776 | consumed tokens: 8805941248 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707268E+00 | loss scale: 4096.0 | grad norm: 5.752 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.832 | TFLOPs: 42.32 | +[default7]: iteration 4200/ 6200 | consumed samples: 4300800 | consumed tokens: 8808038400 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665612E+00 | loss scale: 4096.0 | grad norm: 4.928 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.533 | TFLOPs: 42.23 | +[default7]: iteration 4201/ 6200 | consumed samples: 4301824 | consumed tokens: 8810135552 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678538E+00 | loss scale: 4096.0 | grad norm: 5.714 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.776 | TFLOPs: 42.30 | +[default7]: iteration 4202/ 6200 | consumed samples: 4302848 | consumed tokens: 8812232704 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683418E+00 | loss scale: 4096.0 | grad norm: 5.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.600 | TFLOPs: 42.25 | +[default7]: iteration 4203/ 6200 | consumed samples: 4303872 | consumed tokens: 8814329856 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726592E+00 | loss scale: 4096.0 | grad norm: 6.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.560 | TFLOPs: 42.23 | +[default0]:[2022-10-06 21:45:46,838] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096.0, reducing to 4096.0 +[default7]: iteration 4204/ 6200 | consumed samples: 4304896 | consumed tokens: 8816427008 | elapsed time per iteration (s): 7.26 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702784E+00 | loss scale: 4096.0 | grad norm: 6.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 141.085 | TFLOPs: 43.00 | +[default7]: iteration 4205/ 6200 | consumed samples: 4305920 | consumed tokens: 8818524160 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691146E+00 | loss scale: 4096.0 | grad norm: 6.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.704 | TFLOPs: 42.28 | +[default0]:[2022-10-06 21:46:01,490] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[default7]: iteration 4206/ 6200 | consumed samples: 4306944 | consumed tokens: 8820621312 | elapsed time per iteration (s): 7.27 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663456E+00 | loss scale: 2048.0 | grad norm: 6.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 140.875 | TFLOPs: 42.94 | +[default7]: iteration 4207/ 6200 | consumed samples: 4307968 | consumed tokens: 8822718464 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.730267E+00 | loss scale: 2048.0 | grad norm: 5.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.009 | TFLOPs: 42.37 | +[default7]: iteration 4208/ 6200 | consumed samples: 4308992 | consumed tokens: 8824815616 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673244E+00 | loss scale: 2048.0 | grad norm: 5.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.813 | TFLOPs: 42.31 | +[default7]: iteration 4209/ 6200 | consumed samples: 4310016 | consumed tokens: 8826912768 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696213E+00 | loss scale: 2048.0 | grad norm: 5.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.688 | TFLOPs: 42.27 | +[default7]: iteration 4210/ 6200 | consumed samples: 4311040 | consumed tokens: 8829009920 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705287E+00 | loss scale: 2048.0 | grad norm: 5.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.728 | TFLOPs: 42.29 | +[default7]: iteration 4211/ 6200 | consumed samples: 4312064 | consumed tokens: 8831107072 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694509E+00 | loss scale: 2048.0 | grad norm: 5.652 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.996 | TFLOPs: 42.37 | +[default7]: iteration 4212/ 6200 | consumed samples: 4313088 | consumed tokens: 8833204224 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.725972E+00 | loss scale: 2048.0 | grad norm: 5.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.505 | TFLOPs: 42.22 | +[default7]: iteration 4213/ 6200 | consumed samples: 4314112 | consumed tokens: 8835301376 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687234E+00 | loss scale: 2048.0 | grad norm: 5.975 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.713 | TFLOPs: 42.28 | +[default7]: iteration 4214/ 6200 | consumed samples: 4315136 | consumed tokens: 8837398528 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.684130E+00 | loss scale: 2048.0 | grad norm: 6.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.665 | TFLOPs: 42.27 | +[default7]: iteration 4215/ 6200 | consumed samples: 4316160 | consumed tokens: 8839495680 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.684969E+00 | loss scale: 2048.0 | grad norm: 5.702 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.680 | TFLOPs: 42.27 | +[default7]: iteration 4216/ 6200 | consumed samples: 4317184 | consumed tokens: 8841592832 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700409E+00 | loss scale: 2048.0 | grad norm: 5.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.802 | TFLOPs: 42.31 | +[default7]: iteration 4217/ 6200 | consumed samples: 4318208 | consumed tokens: 8843689984 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689148E+00 | loss scale: 2048.0 | grad norm: 6.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.905 | TFLOPs: 42.34 | +[default7]: iteration 4218/ 6200 | consumed samples: 4319232 | consumed tokens: 8845787136 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708675E+00 | loss scale: 2048.0 | grad norm: 4.873 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.038 | TFLOPs: 42.38 | +[default7]: iteration 4219/ 6200 | consumed samples: 4320256 | consumed tokens: 8847884288 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678710E+00 | loss scale: 2048.0 | grad norm: 4.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.079 | TFLOPs: 42.39 | +[default7]: iteration 4220/ 6200 | consumed samples: 4321280 | consumed tokens: 8849981440 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678534E+00 | loss scale: 2048.0 | grad norm: 6.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.063 | TFLOPs: 42.39 | +[default7]: iteration 4221/ 6200 | consumed samples: 4322304 | consumed tokens: 8852078592 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666961E+00 | loss scale: 2048.0 | grad norm: 6.593 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.696 | TFLOPs: 42.28 | +[default7]: iteration 4222/ 6200 | consumed samples: 4323328 | consumed tokens: 8854175744 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691288E+00 | loss scale: 2048.0 | grad norm: 6.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.872 | TFLOPs: 42.33 | +[default7]: iteration 4223/ 6200 | consumed samples: 4324352 | consumed tokens: 8856272896 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693848E+00 | loss scale: 2048.0 | grad norm: 5.871 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.646 | TFLOPs: 42.26 | +[default7]: iteration 4224/ 6200 | consumed samples: 4325376 | consumed tokens: 8858370048 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695035E+00 | loss scale: 2048.0 | grad norm: 5.024 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.812 | TFLOPs: 42.31 | +[default7]: iteration 4225/ 6200 | consumed samples: 4326400 | consumed tokens: 8860467200 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707654E+00 | loss scale: 2048.0 | grad norm: 5.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.017 | TFLOPs: 42.37 | +[default7]: iteration 4226/ 6200 | consumed samples: 4327424 | consumed tokens: 8862564352 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700813E+00 | loss scale: 2048.0 | grad norm: 5.906 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.946 | TFLOPs: 42.35 | +[default7]: iteration 4227/ 6200 | consumed samples: 4328448 | consumed tokens: 8864661504 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704351E+00 | loss scale: 2048.0 | grad norm: 5.982 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.067 | TFLOPs: 42.39 | +[default7]: iteration 4228/ 6200 | consumed samples: 4329472 | consumed tokens: 8866758656 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674416E+00 | loss scale: 2048.0 | grad norm: 5.942 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.634 | TFLOPs: 42.26 | +[default7]: iteration 4229/ 6200 | consumed samples: 4330496 | consumed tokens: 8868855808 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661654E+00 | loss scale: 2048.0 | grad norm: 5.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.134 | TFLOPs: 42.41 | +[default7]: iteration 4230/ 6200 | consumed samples: 4331520 | consumed tokens: 8870952960 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680801E+00 | loss scale: 2048.0 | grad norm: 5.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.043 | TFLOPs: 42.38 | +[default7]: iteration 4231/ 6200 | consumed samples: 4332544 | consumed tokens: 8873050112 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660078E+00 | loss scale: 2048.0 | grad norm: 5.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.034 | TFLOPs: 42.38 | +[default7]: iteration 4232/ 6200 | consumed samples: 4333568 | consumed tokens: 8875147264 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706866E+00 | loss scale: 2048.0 | grad norm: 4.864 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.120 | TFLOPs: 42.40 | +[default7]: iteration 4233/ 6200 | consumed samples: 4334592 | consumed tokens: 8877244416 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708000E+00 | loss scale: 2048.0 | grad norm: 5.064 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.848 | TFLOPs: 42.32 | +[default7]: iteration 4234/ 6200 | consumed samples: 4335616 | consumed tokens: 8879341568 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711636E+00 | loss scale: 2048.0 | grad norm: 5.733 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.121 | TFLOPs: 42.41 | +[default7]: iteration 4235/ 6200 | consumed samples: 4336640 | consumed tokens: 8881438720 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663452E+00 | loss scale: 2048.0 | grad norm: 5.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.094 | TFLOPs: 42.40 | +[default7]: iteration 4236/ 6200 | consumed samples: 4337664 | consumed tokens: 8883535872 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682120E+00 | loss scale: 2048.0 | grad norm: 5.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.732 | TFLOPs: 42.29 | +[default7]: iteration 4237/ 6200 | consumed samples: 4338688 | consumed tokens: 8885633024 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.701650E+00 | loss scale: 2048.0 | grad norm: 6.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.067 | TFLOPs: 42.39 | +[default7]: iteration 4238/ 6200 | consumed samples: 4339712 | consumed tokens: 8887730176 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676519E+00 | loss scale: 2048.0 | grad norm: 5.847 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.077 | TFLOPs: 42.39 | +[default7]: iteration 4239/ 6200 | consumed samples: 4340736 | consumed tokens: 8889827328 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685132E+00 | loss scale: 2048.0 | grad norm: 6.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.773 | TFLOPs: 42.30 | +[default7]: iteration 4240/ 6200 | consumed samples: 4341760 | consumed tokens: 8891924480 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662474E+00 | loss scale: 2048.0 | grad norm: 5.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.988 | TFLOPs: 42.36 | +[default7]: iteration 4241/ 6200 | consumed samples: 4342784 | consumed tokens: 8894021632 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685211E+00 | loss scale: 2048.0 | grad norm: 6.006 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.863 | TFLOPs: 42.33 | +[default7]: iteration 4242/ 6200 | consumed samples: 4343808 | consumed tokens: 8896118784 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666085E+00 | loss scale: 2048.0 | grad norm: 5.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.739 | TFLOPs: 42.29 | +[default7]: iteration 4243/ 6200 | consumed samples: 4344832 | consumed tokens: 8898215936 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679651E+00 | loss scale: 2048.0 | grad norm: 5.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.998 | TFLOPs: 42.37 | +[default7]: iteration 4244/ 6200 | consumed samples: 4345856 | consumed tokens: 8900313088 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686772E+00 | loss scale: 2048.0 | grad norm: 4.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.936 | TFLOPs: 42.35 | +[default7]: iteration 4245/ 6200 | consumed samples: 4346880 | consumed tokens: 8902410240 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714097E+00 | loss scale: 2048.0 | grad norm: 5.689 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.810 | TFLOPs: 42.31 | +[default7]: iteration 4246/ 6200 | consumed samples: 4347904 | consumed tokens: 8904507392 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690158E+00 | loss scale: 2048.0 | grad norm: 4.911 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.120 | TFLOPs: 42.41 | +[default7]: iteration 4247/ 6200 | consumed samples: 4348928 | consumed tokens: 8906604544 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695768E+00 | loss scale: 2048.0 | grad norm: 5.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.941 | TFLOPs: 42.35 | +[default7]: iteration 4248/ 6200 | consumed samples: 4349952 | consumed tokens: 8908701696 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687776E+00 | loss scale: 2048.0 | grad norm: 5.865 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.119 | TFLOPs: 42.40 | +[default7]: iteration 4249/ 6200 | consumed samples: 4350976 | consumed tokens: 8910798848 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679693E+00 | loss scale: 2048.0 | grad norm: 5.909 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.788 | TFLOPs: 42.30 | +[default7]: iteration 4250/ 6200 | consumed samples: 4352000 | consumed tokens: 8912896000 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704954E+00 | loss scale: 2048.0 | grad norm: 5.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.761 | TFLOPs: 42.30 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 4250 | lm loss value: 3.626136E+00 | lm loss PPL: 3.756736E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 4250 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 21:52:09,977] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4250 is begin to save! +[default0]:[2022-10-06 21:52:09,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_01-model_00-model_states.pt... +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 4250 | lm loss value: 1.572493E+00 | lm loss PPL: 4.818644E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:[2022-10-06 21:52:10,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,367] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,426] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,480] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,507] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,535] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,562] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,590] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,617] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,672] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,699] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,780] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,835] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,916] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:10,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:10,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_25-model_00-model_states.pt. +[default2]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 21:52:10,998] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:11,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 21:52:11,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 21:52:11,026] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/mp_rank_00_model_states.pt +[default0]:[2022-10-06 21:52:11,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 21:52:11,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 21:52:11,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 21:52:11,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 21:52:11,238] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 21:52:11,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 21:52:11,249] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 21:52:11,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 21:52:11,257] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 21:52:11,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 21:52:11,228] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 21:52:11,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 21:52:11,240] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 21:52:11,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 21:52:11,323] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 21:52:11,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 21:52:11,243] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 21:52:11,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 21:52:11,322] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 21:52:11,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 21:52:11,339] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 21:52:11,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 21:52:11,341] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 21:52:11,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 21:52:11,339] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 21:52:11,265] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 21:52:11,265] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 21:52:11,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 21:52:11,337] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 21:52:11,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 21:52:11,358] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 21:52:11,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 21:52:11,362] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 21:52:11,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 21:52:11,348] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 21:52:11,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 21:52:11,322] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 21:52:11,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 21:52:11,343] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 21:52:11,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 21:52:11,325] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 21:52:11,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 21:52:11,335] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 21:52:11,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 21:52:11,358] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 21:52:11,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 21:52:11,342] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 21:52:11,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 21:52:11,336] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 21:52:11,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 21:52:11,344] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 21:52:11,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 21:52:11,430] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 21:52:11,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 21:52:11,407] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 21:52:11,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 21:52:11,363] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 21:52:11,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 21:52:11,453] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default1]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default5]:[2022-10-06 21:52:11,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 21:52:11,453] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default3]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default6]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default5]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default0]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default0]:[2022-10-06 21:52:11,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default0]: successfully saved checkpoint at iteration 4250 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default5]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default3]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default2]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default7]:[2022-10-06 21:52:11,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 21:52:11,461] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default7]:time (ms) | save-checkpoint: 1493.62 +[default1]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default4]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default3]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default4]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default4]:[2022-10-06 21:52:11,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 21:52:11,452] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default6]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default2]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default0]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default6]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default4]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default1]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default2]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default7]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default2]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default1]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default5]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default7]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default7]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default0]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default6]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 21:52:11,469] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4250/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 21:52:11,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4250 is ready now! +[default7]: iteration 4251/ 6200 | consumed samples: 4353024 | consumed tokens: 8914993152 | elapsed time per iteration (s): 52.98 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682031E+00 | loss scale: 2048.0 | grad norm: 5.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.329 | TFLOPs: 5.89 | +[default7]: iteration 4252/ 6200 | consumed samples: 4354048 | consumed tokens: 8917090304 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692995E+00 | loss scale: 2048.0 | grad norm: 5.777 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.907 | TFLOPs: 42.34 | +[default7]: iteration 4253/ 6200 | consumed samples: 4355072 | consumed tokens: 8919187456 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656317E+00 | loss scale: 2048.0 | grad norm: 6.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.850 | TFLOPs: 42.32 | +[default7]: iteration 4254/ 6200 | consumed samples: 4356096 | consumed tokens: 8921284608 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665264E+00 | loss scale: 2048.0 | grad norm: 5.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.164 | TFLOPs: 42.42 | +[default7]: iteration 4255/ 6200 | consumed samples: 4357120 | consumed tokens: 8923381760 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.717988E+00 | loss scale: 2048.0 | grad norm: 5.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.087 | TFLOPs: 42.39 | +[default7]: iteration 4256/ 6200 | consumed samples: 4358144 | consumed tokens: 8925478912 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682047E+00 | loss scale: 2048.0 | grad norm: 5.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.149 | TFLOPs: 42.41 | +[default7]: iteration 4257/ 6200 | consumed samples: 4359168 | consumed tokens: 8927576064 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662251E+00 | loss scale: 2048.0 | grad norm: 6.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.169 | TFLOPs: 42.42 | +[default7]: iteration 4258/ 6200 | consumed samples: 4360192 | consumed tokens: 8929673216 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704866E+00 | loss scale: 2048.0 | grad norm: 5.888 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.025 | TFLOPs: 42.38 | +[default7]: iteration 4259/ 6200 | consumed samples: 4361216 | consumed tokens: 8931770368 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714846E+00 | loss scale: 2048.0 | grad norm: 5.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.683 | TFLOPs: 42.27 | +[default7]: iteration 4260/ 6200 | consumed samples: 4362240 | consumed tokens: 8933867520 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.677058E+00 | loss scale: 2048.0 | grad norm: 4.638 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.766 | TFLOPs: 42.30 | +[default7]: iteration 4261/ 6200 | consumed samples: 4363264 | consumed tokens: 8935964672 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670865E+00 | loss scale: 2048.0 | grad norm: 7.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.917 | TFLOPs: 42.34 | +[default7]: iteration 4262/ 6200 | consumed samples: 4364288 | consumed tokens: 8938061824 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690973E+00 | loss scale: 2048.0 | grad norm: 5.726 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.620 | TFLOPs: 42.25 | +[default7]: iteration 4263/ 6200 | consumed samples: 4365312 | consumed tokens: 8940158976 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674972E+00 | loss scale: 2048.0 | grad norm: 4.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.012 | TFLOPs: 42.37 | +[default7]: iteration 4264/ 6200 | consumed samples: 4366336 | consumed tokens: 8942256128 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694934E+00 | loss scale: 2048.0 | grad norm: 4.761 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.059 | TFLOPs: 42.08 | +[default7]: iteration 4265/ 6200 | consumed samples: 4367360 | consumed tokens: 8944353280 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681244E+00 | loss scale: 2048.0 | grad norm: 6.621 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.847 | TFLOPs: 42.32 | +[default7]: iteration 4266/ 6200 | consumed samples: 4368384 | consumed tokens: 8946450432 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679707E+00 | loss scale: 2048.0 | grad norm: 5.579 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.234 | TFLOPs: 42.44 | +[default7]: iteration 4267/ 6200 | consumed samples: 4369408 | consumed tokens: 8948547584 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682755E+00 | loss scale: 2048.0 | grad norm: 5.877 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.949 | TFLOPs: 42.35 | +[default7]: iteration 4268/ 6200 | consumed samples: 4370432 | consumed tokens: 8950644736 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702542E+00 | loss scale: 2048.0 | grad norm: 5.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.820 | TFLOPs: 42.31 | +[default7]: iteration 4269/ 6200 | consumed samples: 4371456 | consumed tokens: 8952741888 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.721635E+00 | loss scale: 2048.0 | grad norm: 8.894 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.967 | TFLOPs: 42.36 | +[default7]: iteration 4270/ 6200 | consumed samples: 4372480 | consumed tokens: 8954839040 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668374E+00 | loss scale: 2048.0 | grad norm: 6.874 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.991 | TFLOPs: 42.37 | +[default7]: iteration 4271/ 6200 | consumed samples: 4373504 | consumed tokens: 8956936192 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683824E+00 | loss scale: 2048.0 | grad norm: 4.665 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.957 | TFLOPs: 42.36 | +[default7]: iteration 4272/ 6200 | consumed samples: 4374528 | consumed tokens: 8959033344 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688255E+00 | loss scale: 2048.0 | grad norm: 4.975 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.641 | TFLOPs: 42.26 | +[default7]: iteration 4273/ 6200 | consumed samples: 4375552 | consumed tokens: 8961130496 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689010E+00 | loss scale: 2048.0 | grad norm: 7.780 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.999 | TFLOPs: 42.37 | +[default7]: iteration 4274/ 6200 | consumed samples: 4376576 | consumed tokens: 8963227648 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696503E+00 | loss scale: 2048.0 | grad norm: 5.913 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.792 | TFLOPs: 42.30 | +[default7]: iteration 4275/ 6200 | consumed samples: 4377600 | consumed tokens: 8965324800 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693361E+00 | loss scale: 2048.0 | grad norm: 6.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.710 | TFLOPs: 42.28 | +[default7]: iteration 4276/ 6200 | consumed samples: 4378624 | consumed tokens: 8967421952 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.684224E+00 | loss scale: 2048.0 | grad norm: 6.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.874 | TFLOPs: 42.33 | +[default7]: iteration 4277/ 6200 | consumed samples: 4379648 | consumed tokens: 8969519104 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698049E+00 | loss scale: 2048.0 | grad norm: 5.923 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.758 | TFLOPs: 42.29 | +[default7]: iteration 4278/ 6200 | consumed samples: 4380672 | consumed tokens: 8971616256 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686141E+00 | loss scale: 2048.0 | grad norm: 5.804 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.817 | TFLOPs: 42.31 | +[default7]: iteration 4279/ 6200 | consumed samples: 4381696 | consumed tokens: 8973713408 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676606E+00 | loss scale: 2048.0 | grad norm: 5.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.047 | TFLOPs: 42.38 | +[default7]: iteration 4280/ 6200 | consumed samples: 4382720 | consumed tokens: 8975810560 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695441E+00 | loss scale: 2048.0 | grad norm: 4.913 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.806 | TFLOPs: 42.31 | +[default7]: iteration 4281/ 6200 | consumed samples: 4383744 | consumed tokens: 8977907712 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696648E+00 | loss scale: 2048.0 | grad norm: 4.758 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.890 | TFLOPs: 42.33 | +[default7]: iteration 4282/ 6200 | consumed samples: 4384768 | consumed tokens: 8980004864 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686214E+00 | loss scale: 2048.0 | grad norm: 5.795 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.928 | TFLOPs: 42.35 | +[default7]: iteration 4283/ 6200 | consumed samples: 4385792 | consumed tokens: 8982102016 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694097E+00 | loss scale: 2048.0 | grad norm: 5.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.596 | TFLOPs: 42.25 | +[default7]: iteration 4284/ 6200 | consumed samples: 4386816 | consumed tokens: 8984199168 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.706359E+00 | loss scale: 2048.0 | grad norm: 5.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.771 | TFLOPs: 42.30 | +[default7]: iteration 4285/ 6200 | consumed samples: 4387840 | consumed tokens: 8986296320 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.712801E+00 | loss scale: 2048.0 | grad norm: 5.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.912 | TFLOPs: 42.34 | +[default7]: iteration 4286/ 6200 | consumed samples: 4388864 | consumed tokens: 8988393472 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699111E+00 | loss scale: 2048.0 | grad norm: 5.777 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.660 | TFLOPs: 42.26 | +[default7]: iteration 4287/ 6200 | consumed samples: 4389888 | consumed tokens: 8990490624 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676264E+00 | loss scale: 2048.0 | grad norm: 4.919 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.820 | TFLOPs: 42.31 | +[default7]: iteration 4288/ 6200 | consumed samples: 4390912 | consumed tokens: 8992587776 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649169E+00 | loss scale: 2048.0 | grad norm: 5.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.942 | TFLOPs: 42.35 | +[default7]: iteration 4289/ 6200 | consumed samples: 4391936 | consumed tokens: 8994684928 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699679E+00 | loss scale: 2048.0 | grad norm: 5.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.074 | TFLOPs: 42.39 | +[default7]: iteration 4290/ 6200 | consumed samples: 4392960 | consumed tokens: 8996782080 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.677083E+00 | loss scale: 2048.0 | grad norm: 5.016 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.998 | TFLOPs: 42.37 | +[default7]: iteration 4291/ 6200 | consumed samples: 4393984 | consumed tokens: 8998879232 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698455E+00 | loss scale: 2048.0 | grad norm: 4.888 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.989 | TFLOPs: 42.37 | +[default7]: iteration 4292/ 6200 | consumed samples: 4395008 | consumed tokens: 9000976384 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681016E+00 | loss scale: 2048.0 | grad norm: 5.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.707 | TFLOPs: 42.28 | +[default7]: iteration 4293/ 6200 | consumed samples: 4396032 | consumed tokens: 9003073536 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668443E+00 | loss scale: 2048.0 | grad norm: 4.804 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.967 | TFLOPs: 42.36 | +[default7]: iteration 4294/ 6200 | consumed samples: 4397056 | consumed tokens: 9005170688 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679519E+00 | loss scale: 2048.0 | grad norm: 4.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.795 | TFLOPs: 42.31 | +[default7]: iteration 4295/ 6200 | consumed samples: 4398080 | consumed tokens: 9007267840 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.684369E+00 | loss scale: 2048.0 | grad norm: 5.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.671 | TFLOPs: 42.27 | +[default7]: iteration 4296/ 6200 | consumed samples: 4399104 | consumed tokens: 9009364992 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658527E+00 | loss scale: 2048.0 | grad norm: 6.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.682 | TFLOPs: 42.27 | +[default7]: iteration 4297/ 6200 | consumed samples: 4400128 | consumed tokens: 9011462144 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690209E+00 | loss scale: 2048.0 | grad norm: 6.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.474 | TFLOPs: 42.21 | +[default7]: iteration 4298/ 6200 | consumed samples: 4401152 | consumed tokens: 9013559296 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695436E+00 | loss scale: 2048.0 | grad norm: 4.773 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.416 | TFLOPs: 42.19 | +[default7]: iteration 4299/ 6200 | consumed samples: 4402176 | consumed tokens: 9015656448 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699141E+00 | loss scale: 2048.0 | grad norm: 5.778 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.565 | TFLOPs: 42.24 | +[default7]: iteration 4300/ 6200 | consumed samples: 4403200 | consumed tokens: 9017753600 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648547E+00 | loss scale: 2048.0 | grad norm: 5.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.574 | TFLOPs: 42.24 | +[default7]: iteration 4301/ 6200 | consumed samples: 4404224 | consumed tokens: 9019850752 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673389E+00 | loss scale: 2048.0 | grad norm: 5.651 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.603 | TFLOPs: 42.25 | +[default7]: iteration 4302/ 6200 | consumed samples: 4405248 | consumed tokens: 9021947904 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703980E+00 | loss scale: 2048.0 | grad norm: 6.966 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.588 | TFLOPs: 42.24 | +[default7]: iteration 4303/ 6200 | consumed samples: 4406272 | consumed tokens: 9024045056 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679053E+00 | loss scale: 2048.0 | grad norm: 5.870 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.627 | TFLOPs: 42.25 | +[default7]: iteration 4304/ 6200 | consumed samples: 4407296 | consumed tokens: 9026142208 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674317E+00 | loss scale: 2048.0 | grad norm: 4.910 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.477 | TFLOPs: 42.21 | +[default7]: iteration 4305/ 6200 | consumed samples: 4408320 | consumed tokens: 9028239360 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695577E+00 | loss scale: 2048.0 | grad norm: 5.841 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.378 | TFLOPs: 42.18 | +[default7]: iteration 4306/ 6200 | consumed samples: 4409344 | consumed tokens: 9030336512 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691373E+00 | loss scale: 2048.0 | grad norm: 6.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.766 | TFLOPs: 41.99 | +[default7]: iteration 4307/ 6200 | consumed samples: 4410368 | consumed tokens: 9032433664 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699620E+00 | loss scale: 2048.0 | grad norm: 5.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.010 | TFLOPs: 42.07 | +[default7]: iteration 4308/ 6200 | consumed samples: 4411392 | consumed tokens: 9034530816 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705349E+00 | loss scale: 2048.0 | grad norm: 6.740 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.042 | TFLOPs: 42.38 | +[default7]: iteration 4309/ 6200 | consumed samples: 4412416 | consumed tokens: 9036627968 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689903E+00 | loss scale: 2048.0 | grad norm: 6.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.975 | TFLOPs: 42.36 | +[default7]: iteration 4310/ 6200 | consumed samples: 4413440 | consumed tokens: 9038725120 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697687E+00 | loss scale: 2048.0 | grad norm: 8.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.995 | TFLOPs: 42.37 | +[default7]: iteration 4311/ 6200 | consumed samples: 4414464 | consumed tokens: 9040822272 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688023E+00 | loss scale: 2048.0 | grad norm: 5.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.287 | TFLOPs: 42.46 | +[default7]: iteration 4312/ 6200 | consumed samples: 4415488 | consumed tokens: 9042919424 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.701939E+00 | loss scale: 2048.0 | grad norm: 6.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.010 | TFLOPs: 42.37 | +[default7]: iteration 4313/ 6200 | consumed samples: 4416512 | consumed tokens: 9045016576 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703962E+00 | loss scale: 2048.0 | grad norm: 6.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.882 | TFLOPs: 42.33 | +[default7]: iteration 4314/ 6200 | consumed samples: 4417536 | consumed tokens: 9047113728 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705045E+00 | loss scale: 2048.0 | grad norm: 6.813 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.298 | TFLOPs: 42.46 | +[default7]: iteration 4315/ 6200 | consumed samples: 4418560 | consumed tokens: 9049210880 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675695E+00 | loss scale: 2048.0 | grad norm: 5.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.284 | TFLOPs: 42.46 | +[default7]: iteration 4316/ 6200 | consumed samples: 4419584 | consumed tokens: 9051308032 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686040E+00 | loss scale: 2048.0 | grad norm: 5.687 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.894 | TFLOPs: 42.34 | +[default7]: iteration 4317/ 6200 | consumed samples: 4420608 | consumed tokens: 9053405184 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676151E+00 | loss scale: 2048.0 | grad norm: 5.986 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.931 | TFLOPs: 42.35 | +[default7]: iteration 4318/ 6200 | consumed samples: 4421632 | consumed tokens: 9055502336 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690159E+00 | loss scale: 2048.0 | grad norm: 4.625 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.907 | TFLOPs: 42.34 | +[default7]: iteration 4319/ 6200 | consumed samples: 4422656 | consumed tokens: 9057599488 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666845E+00 | loss scale: 2048.0 | grad norm: 4.970 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.068 | TFLOPs: 42.39 | +[default7]: iteration 4320/ 6200 | consumed samples: 4423680 | consumed tokens: 9059696640 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682232E+00 | loss scale: 2048.0 | grad norm: 5.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.952 | TFLOPs: 42.35 | +[default7]: iteration 4321/ 6200 | consumed samples: 4424704 | consumed tokens: 9061793792 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659348E+00 | loss scale: 2048.0 | grad norm: 5.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.011 | TFLOPs: 42.37 | +[default7]: iteration 4322/ 6200 | consumed samples: 4425728 | consumed tokens: 9063890944 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661076E+00 | loss scale: 2048.0 | grad norm: 5.882 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.052 | TFLOPs: 42.38 | +[default7]: iteration 4323/ 6200 | consumed samples: 4426752 | consumed tokens: 9065988096 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687266E+00 | loss scale: 2048.0 | grad norm: 5.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.839 | TFLOPs: 42.32 | +[default7]: iteration 4324/ 6200 | consumed samples: 4427776 | consumed tokens: 9068085248 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.716892E+00 | loss scale: 2048.0 | grad norm: 5.846 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.906 | TFLOPs: 42.34 | +[default7]: iteration 4325/ 6200 | consumed samples: 4428800 | consumed tokens: 9070182400 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683818E+00 | loss scale: 2048.0 | grad norm: 5.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.989 | TFLOPs: 42.37 | +[default7]: iteration 4326/ 6200 | consumed samples: 4429824 | consumed tokens: 9072279552 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663574E+00 | loss scale: 2048.0 | grad norm: 4.982 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.031 | TFLOPs: 42.38 | +[default7]: iteration 4327/ 6200 | consumed samples: 4430848 | consumed tokens: 9074376704 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691826E+00 | loss scale: 2048.0 | grad norm: 4.774 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.688 | TFLOPs: 42.27 | +[default7]: iteration 4328/ 6200 | consumed samples: 4431872 | consumed tokens: 9076473856 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678177E+00 | loss scale: 2048.0 | grad norm: 5.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.859 | TFLOPs: 42.02 | +[default7]: iteration 4329/ 6200 | consumed samples: 4432896 | consumed tokens: 9078571008 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670979E+00 | loss scale: 2048.0 | grad norm: 6.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.483 | TFLOPs: 42.21 | +[default7]: iteration 4330/ 6200 | consumed samples: 4433920 | consumed tokens: 9080668160 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666656E+00 | loss scale: 2048.0 | grad norm: 6.765 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.431 | TFLOPs: 42.20 | +[default7]: iteration 4331/ 6200 | consumed samples: 4434944 | consumed tokens: 9082765312 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699610E+00 | loss scale: 2048.0 | grad norm: 5.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.627 | TFLOPs: 41.95 | +[default7]: iteration 4332/ 6200 | consumed samples: 4435968 | consumed tokens: 9084862464 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662621E+00 | loss scale: 2048.0 | grad norm: 4.892 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.875 | TFLOPs: 42.03 | +[default7]: iteration 4333/ 6200 | consumed samples: 4436992 | consumed tokens: 9086959616 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681502E+00 | loss scale: 2048.0 | grad norm: 6.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.584 | TFLOPs: 42.24 | +[default7]: iteration 4334/ 6200 | consumed samples: 4438016 | consumed tokens: 9089056768 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697777E+00 | loss scale: 2048.0 | grad norm: 5.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.607 | TFLOPs: 42.25 | +[default7]: iteration 4335/ 6200 | consumed samples: 4439040 | consumed tokens: 9091153920 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.701138E+00 | loss scale: 2048.0 | grad norm: 5.635 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.530 | TFLOPs: 42.23 | +[default7]: iteration 4336/ 6200 | consumed samples: 4440064 | consumed tokens: 9093251072 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687738E+00 | loss scale: 2048.0 | grad norm: 6.050 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.498 | TFLOPs: 41.91 | +[default7]: iteration 4337/ 6200 | consumed samples: 4441088 | consumed tokens: 9095348224 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710311E+00 | loss scale: 2048.0 | grad norm: 5.585 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.398 | TFLOPs: 42.18 | +[default7]: iteration 4338/ 6200 | consumed samples: 4442112 | consumed tokens: 9097445376 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707129E+00 | loss scale: 2048.0 | grad norm: 5.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.377 | TFLOPs: 42.18 | +[default7]: iteration 4339/ 6200 | consumed samples: 4443136 | consumed tokens: 9099542528 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696448E+00 | loss scale: 2048.0 | grad norm: 5.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.430 | TFLOPs: 42.19 | +[default7]: iteration 4340/ 6200 | consumed samples: 4444160 | consumed tokens: 9101639680 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698575E+00 | loss scale: 2048.0 | grad norm: 5.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.504 | TFLOPs: 41.91 | +[default7]: iteration 4341/ 6200 | consumed samples: 4445184 | consumed tokens: 9103736832 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671682E+00 | loss scale: 2048.0 | grad norm: 4.742 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.455 | TFLOPs: 42.20 | +[default7]: iteration 4342/ 6200 | consumed samples: 4446208 | consumed tokens: 9105833984 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668300E+00 | loss scale: 2048.0 | grad norm: 5.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.240 | TFLOPs: 42.14 | +[default7]: iteration 4343/ 6200 | consumed samples: 4447232 | consumed tokens: 9107931136 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671455E+00 | loss scale: 2048.0 | grad norm: 5.948 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.249 | TFLOPs: 42.14 | +[default7]: iteration 4344/ 6200 | consumed samples: 4448256 | consumed tokens: 9110028288 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691421E+00 | loss scale: 2048.0 | grad norm: 5.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.345 | TFLOPs: 42.17 | +[default7]: iteration 4345/ 6200 | consumed samples: 4449280 | consumed tokens: 9112125440 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686932E+00 | loss scale: 2048.0 | grad norm: 5.593 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.397 | TFLOPs: 42.18 | +[default7]: iteration 4346/ 6200 | consumed samples: 4450304 | consumed tokens: 9114222592 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679419E+00 | loss scale: 2048.0 | grad norm: 5.874 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.279 | TFLOPs: 42.15 | +[default7]: iteration 4347/ 6200 | consumed samples: 4451328 | consumed tokens: 9116319744 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692818E+00 | loss scale: 2048.0 | grad norm: 5.942 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.411 | TFLOPs: 42.19 | +[default7]: iteration 4348/ 6200 | consumed samples: 4452352 | consumed tokens: 9118416896 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.721150E+00 | loss scale: 2048.0 | grad norm: 5.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.459 | TFLOPs: 42.20 | +[default7]: iteration 4349/ 6200 | consumed samples: 4453376 | consumed tokens: 9120514048 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685299E+00 | loss scale: 2048.0 | grad norm: 5.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.589 | TFLOPs: 42.24 | +[default7]: iteration 4350/ 6200 | consumed samples: 4454400 | consumed tokens: 9122611200 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676496E+00 | loss scale: 2048.0 | grad norm: 7.008 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.680 | TFLOPs: 42.27 | +[default7]: iteration 4351/ 6200 | consumed samples: 4455424 | consumed tokens: 9124708352 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676546E+00 | loss scale: 2048.0 | grad norm: 6.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.626 | TFLOPs: 42.25 | +[default7]: iteration 4352/ 6200 | consumed samples: 4456448 | consumed tokens: 9126805504 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681973E+00 | loss scale: 2048.0 | grad norm: 6.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.550 | TFLOPs: 42.23 | +[default7]: iteration 4353/ 6200 | consumed samples: 4457472 | consumed tokens: 9128902656 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667738E+00 | loss scale: 2048.0 | grad norm: 5.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.550 | TFLOPs: 42.23 | +[default7]: iteration 4354/ 6200 | consumed samples: 4458496 | consumed tokens: 9130999808 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670763E+00 | loss scale: 2048.0 | grad norm: 6.975 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.359 | TFLOPs: 42.17 | +[default7]: iteration 4355/ 6200 | consumed samples: 4459520 | consumed tokens: 9133096960 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668784E+00 | loss scale: 2048.0 | grad norm: 6.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.561 | TFLOPs: 42.23 | +[default7]: iteration 4356/ 6200 | consumed samples: 4460544 | consumed tokens: 9135194112 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665689E+00 | loss scale: 2048.0 | grad norm: 6.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.871 | TFLOPs: 42.33 | +[default7]: iteration 4357/ 6200 | consumed samples: 4461568 | consumed tokens: 9137291264 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700613E+00 | loss scale: 2048.0 | grad norm: 5.752 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.720 | TFLOPs: 42.28 | +[default7]: iteration 4358/ 6200 | consumed samples: 4462592 | consumed tokens: 9139388416 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675818E+00 | loss scale: 2048.0 | grad norm: 4.918 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.698 | TFLOPs: 42.28 | +[default7]: iteration 4359/ 6200 | consumed samples: 4463616 | consumed tokens: 9141485568 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676284E+00 | loss scale: 2048.0 | grad norm: 5.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.628 | TFLOPs: 42.26 | +[default7]: iteration 4360/ 6200 | consumed samples: 4464640 | consumed tokens: 9143582720 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682337E+00 | loss scale: 2048.0 | grad norm: 5.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.803 | TFLOPs: 42.31 | +[default7]: iteration 4361/ 6200 | consumed samples: 4465664 | consumed tokens: 9145679872 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688228E+00 | loss scale: 2048.0 | grad norm: 5.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.610 | TFLOPs: 42.25 | +[default7]: iteration 4362/ 6200 | consumed samples: 4466688 | consumed tokens: 9147777024 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679337E+00 | loss scale: 2048.0 | grad norm: 5.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.899 | TFLOPs: 42.34 | +[default7]: iteration 4363/ 6200 | consumed samples: 4467712 | consumed tokens: 9149874176 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.704796E+00 | loss scale: 2048.0 | grad norm: 6.614 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.807 | TFLOPs: 42.31 | +[default7]: iteration 4364/ 6200 | consumed samples: 4468736 | consumed tokens: 9151971328 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.684556E+00 | loss scale: 2048.0 | grad norm: 5.888 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.480 | TFLOPs: 42.21 | +[default7]: iteration 4365/ 6200 | consumed samples: 4469760 | consumed tokens: 9154068480 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670643E+00 | loss scale: 2048.0 | grad norm: 5.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.303 | TFLOPs: 42.16 | +[default7]: iteration 4366/ 6200 | consumed samples: 4470784 | consumed tokens: 9156165632 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682361E+00 | loss scale: 2048.0 | grad norm: 6.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.471 | TFLOPs: 42.21 | +[default7]: iteration 4367/ 6200 | consumed samples: 4471808 | consumed tokens: 9158262784 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659515E+00 | loss scale: 2048.0 | grad norm: 5.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.437 | TFLOPs: 42.20 | +[default7]: iteration 4368/ 6200 | consumed samples: 4472832 | consumed tokens: 9160359936 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674484E+00 | loss scale: 2048.0 | grad norm: 5.835 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.227 | TFLOPs: 42.13 | +[default7]: iteration 4369/ 6200 | consumed samples: 4473856 | consumed tokens: 9162457088 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686393E+00 | loss scale: 2048.0 | grad norm: 4.766 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.347 | TFLOPs: 41.86 | +[default7]: iteration 4370/ 6200 | consumed samples: 4474880 | consumed tokens: 9164554240 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689882E+00 | loss scale: 2048.0 | grad norm: 4.743 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.221 | TFLOPs: 41.83 | +[default7]: iteration 4371/ 6200 | consumed samples: 4475904 | consumed tokens: 9166651392 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675721E+00 | loss scale: 2048.0 | grad norm: 5.911 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.411 | TFLOPs: 42.19 | +[default7]: iteration 4372/ 6200 | consumed samples: 4476928 | consumed tokens: 9168748544 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673109E+00 | loss scale: 2048.0 | grad norm: 5.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.485 | TFLOPs: 42.21 | +[default7]: iteration 4373/ 6200 | consumed samples: 4477952 | consumed tokens: 9170845696 | elapsed time per iteration (s): 7.46 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696988E+00 | loss scale: 2048.0 | grad norm: 4.985 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.259 | TFLOPs: 41.84 | +[default7]: iteration 4374/ 6200 | consumed samples: 4478976 | consumed tokens: 9172942848 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672147E+00 | loss scale: 2048.0 | grad norm: 6.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.642 | TFLOPs: 41.95 | +[default7]: iteration 4375/ 6200 | consumed samples: 4480000 | consumed tokens: 9175040000 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690793E+00 | loss scale: 2048.0 | grad norm: 6.033 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.682 | TFLOPs: 42.27 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 4375 | lm loss value: 3.614674E+00 | lm loss PPL: 3.713925E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 4375 | lm loss value: 1.571158E+00 | lm loss PPL: 4.812219E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 4376/ 6200 | consumed samples: 4481024 | consumed tokens: 9177137152 | elapsed time per iteration (s): 51.52 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685900E+00 | loss scale: 2048.0 | grad norm: 5.714 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.876 | TFLOPs: 6.06 | +[default7]: iteration 4377/ 6200 | consumed samples: 4482048 | consumed tokens: 9179234304 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681814E+00 | loss scale: 2048.0 | grad norm: 5.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.441 | TFLOPs: 42.20 | +[default7]: iteration 4378/ 6200 | consumed samples: 4483072 | consumed tokens: 9181331456 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696506E+00 | loss scale: 2048.0 | grad norm: 5.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.526 | TFLOPs: 42.22 | +[default7]: iteration 4379/ 6200 | consumed samples: 4484096 | consumed tokens: 9183428608 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674500E+00 | loss scale: 2048.0 | grad norm: 4.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.454 | TFLOPs: 42.20 | +[default7]: iteration 4380/ 6200 | consumed samples: 4485120 | consumed tokens: 9185525760 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.684021E+00 | loss scale: 2048.0 | grad norm: 4.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.327 | TFLOPs: 42.16 | +[default7]: iteration 4381/ 6200 | consumed samples: 4486144 | consumed tokens: 9187622912 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690292E+00 | loss scale: 2048.0 | grad norm: 5.725 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.433 | TFLOPs: 42.20 | +[default7]: iteration 4382/ 6200 | consumed samples: 4487168 | consumed tokens: 9189720064 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691605E+00 | loss scale: 2048.0 | grad norm: 4.815 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.466 | TFLOPs: 42.21 | +[default7]: iteration 4383/ 6200 | consumed samples: 4488192 | consumed tokens: 9191817216 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697396E+00 | loss scale: 2048.0 | grad norm: 5.952 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.496 | TFLOPs: 42.21 | +[default7]: iteration 4384/ 6200 | consumed samples: 4489216 | consumed tokens: 9193914368 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688335E+00 | loss scale: 2048.0 | grad norm: 5.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.114 | TFLOPs: 42.10 | +[default7]: iteration 4385/ 6200 | consumed samples: 4490240 | consumed tokens: 9196011520 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709211E+00 | loss scale: 2048.0 | grad norm: 5.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.195 | TFLOPs: 42.12 | +[default7]: iteration 4386/ 6200 | consumed samples: 4491264 | consumed tokens: 9198108672 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689692E+00 | loss scale: 2048.0 | grad norm: 4.980 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.063 | TFLOPs: 42.08 | +[default7]: iteration 4387/ 6200 | consumed samples: 4492288 | consumed tokens: 9200205824 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661289E+00 | loss scale: 2048.0 | grad norm: 5.881 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.965 | TFLOPs: 42.05 | +[default7]: iteration 4388/ 6200 | consumed samples: 4493312 | consumed tokens: 9202302976 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663702E+00 | loss scale: 2048.0 | grad norm: 5.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.545 | TFLOPs: 42.23 | +[default7]: iteration 4389/ 6200 | consumed samples: 4494336 | consumed tokens: 9204400128 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663513E+00 | loss scale: 2048.0 | grad norm: 4.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.649 | TFLOPs: 42.26 | +[default7]: iteration 4390/ 6200 | consumed samples: 4495360 | consumed tokens: 9206497280 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693574E+00 | loss scale: 2048.0 | grad norm: 5.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.210 | TFLOPs: 42.13 | +[default7]: iteration 4391/ 6200 | consumed samples: 4496384 | consumed tokens: 9208594432 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678259E+00 | loss scale: 2048.0 | grad norm: 5.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.296 | TFLOPs: 42.15 | +[default7]: iteration 4392/ 6200 | consumed samples: 4497408 | consumed tokens: 9210691584 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672131E+00 | loss scale: 2048.0 | grad norm: 5.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.542 | TFLOPs: 42.23 | +[default7]: iteration 4393/ 6200 | consumed samples: 4498432 | consumed tokens: 9212788736 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710961E+00 | loss scale: 2048.0 | grad norm: 5.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.468 | TFLOPs: 42.21 | +[default7]: iteration 4394/ 6200 | consumed samples: 4499456 | consumed tokens: 9214885888 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694969E+00 | loss scale: 2048.0 | grad norm: 4.859 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.665 | TFLOPs: 42.27 | +[default7]: iteration 4395/ 6200 | consumed samples: 4500480 | consumed tokens: 9216983040 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696646E+00 | loss scale: 2048.0 | grad norm: 5.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.896 | TFLOPs: 42.34 | +[default7]: iteration 4396/ 6200 | consumed samples: 4501504 | consumed tokens: 9219080192 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675225E+00 | loss scale: 2048.0 | grad norm: 5.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.724 | TFLOPs: 42.28 | +[default7]: iteration 4397/ 6200 | consumed samples: 4502528 | consumed tokens: 9221177344 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673769E+00 | loss scale: 2048.0 | grad norm: 7.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.326 | TFLOPs: 42.16 | +[default7]: iteration 4398/ 6200 | consumed samples: 4503552 | consumed tokens: 9223274496 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685785E+00 | loss scale: 2048.0 | grad norm: 6.638 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.510 | TFLOPs: 42.22 | +[default7]: iteration 4399/ 6200 | consumed samples: 4504576 | consumed tokens: 9225371648 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696890E+00 | loss scale: 2048.0 | grad norm: 5.004 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.484 | TFLOPs: 42.21 | +[default7]: iteration 4400/ 6200 | consumed samples: 4505600 | consumed tokens: 9227468800 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692308E+00 | loss scale: 2048.0 | grad norm: 5.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.633 | TFLOPs: 42.26 | +[default7]: iteration 4401/ 6200 | consumed samples: 4506624 | consumed tokens: 9229565952 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692293E+00 | loss scale: 2048.0 | grad norm: 5.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.513 | TFLOPs: 42.22 | +[default7]: iteration 4402/ 6200 | consumed samples: 4507648 | consumed tokens: 9231663104 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666564E+00 | loss scale: 2048.0 | grad norm: 6.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.842 | TFLOPs: 42.32 | +[default7]: iteration 4403/ 6200 | consumed samples: 4508672 | consumed tokens: 9233760256 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686638E+00 | loss scale: 2048.0 | grad norm: 4.767 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.819 | TFLOPs: 42.31 | +[default7]: iteration 4404/ 6200 | consumed samples: 4509696 | consumed tokens: 9235857408 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678947E+00 | loss scale: 2048.0 | grad norm: 5.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.741 | TFLOPs: 42.29 | +[default7]: iteration 4405/ 6200 | consumed samples: 4510720 | consumed tokens: 9237954560 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685622E+00 | loss scale: 2048.0 | grad norm: 5.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.534 | TFLOPs: 42.23 | +[default7]: iteration 4406/ 6200 | consumed samples: 4511744 | consumed tokens: 9240051712 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678386E+00 | loss scale: 2048.0 | grad norm: 7.763 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.555 | TFLOPs: 42.23 | +[default7]: iteration 4407/ 6200 | consumed samples: 4512768 | consumed tokens: 9242148864 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686835E+00 | loss scale: 2048.0 | grad norm: 5.806 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.768 | TFLOPs: 42.30 | +[default7]: iteration 4408/ 6200 | consumed samples: 4513792 | consumed tokens: 9244246016 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690988E+00 | loss scale: 2048.0 | grad norm: 5.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.552 | TFLOPs: 42.23 | +[default7]: iteration 4409/ 6200 | consumed samples: 4514816 | consumed tokens: 9246343168 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667111E+00 | loss scale: 2048.0 | grad norm: 7.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.688 | TFLOPs: 42.27 | +[default7]: iteration 4410/ 6200 | consumed samples: 4515840 | consumed tokens: 9248440320 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682379E+00 | loss scale: 2048.0 | grad norm: 6.856 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.723 | TFLOPs: 42.28 | +[default7]: iteration 4411/ 6200 | consumed samples: 4516864 | consumed tokens: 9250537472 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.726216E+00 | loss scale: 2048.0 | grad norm: 6.061 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.508 | TFLOPs: 42.22 | +[default7]: iteration 4412/ 6200 | consumed samples: 4517888 | consumed tokens: 9252634624 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674248E+00 | loss scale: 2048.0 | grad norm: 5.679 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.555 | TFLOPs: 42.23 | +[default7]: iteration 4413/ 6200 | consumed samples: 4518912 | consumed tokens: 9254731776 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645594E+00 | loss scale: 2048.0 | grad norm: 6.767 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.554 | TFLOPs: 42.23 | +[default7]: iteration 4414/ 6200 | consumed samples: 4519936 | consumed tokens: 9256828928 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.710728E+00 | loss scale: 2048.0 | grad norm: 6.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.909 | TFLOPs: 42.34 | +[default7]: iteration 4415/ 6200 | consumed samples: 4520960 | consumed tokens: 9258926080 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675503E+00 | loss scale: 2048.0 | grad norm: 5.657 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.824 | TFLOPs: 42.31 | +[default7]: iteration 4416/ 6200 | consumed samples: 4521984 | consumed tokens: 9261023232 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687272E+00 | loss scale: 2048.0 | grad norm: 5.579 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.680 | TFLOPs: 42.27 | +[default7]: iteration 4417/ 6200 | consumed samples: 4523008 | consumed tokens: 9263120384 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680877E+00 | loss scale: 2048.0 | grad norm: 5.779 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.888 | TFLOPs: 42.33 | +[default7]: iteration 4418/ 6200 | consumed samples: 4524032 | consumed tokens: 9265217536 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670134E+00 | loss scale: 2048.0 | grad norm: 5.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.866 | TFLOPs: 42.33 | +[default7]: iteration 4419/ 6200 | consumed samples: 4525056 | consumed tokens: 9267314688 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679073E+00 | loss scale: 2048.0 | grad norm: 5.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.856 | TFLOPs: 42.32 | +[default7]: iteration 4420/ 6200 | consumed samples: 4526080 | consumed tokens: 9269411840 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688984E+00 | loss scale: 2048.0 | grad norm: 6.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.869 | TFLOPs: 42.33 | +[default7]: iteration 4421/ 6200 | consumed samples: 4527104 | consumed tokens: 9271508992 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695567E+00 | loss scale: 2048.0 | grad norm: 6.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.720 | TFLOPs: 42.28 | +[default7]: iteration 4422/ 6200 | consumed samples: 4528128 | consumed tokens: 9273606144 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680551E+00 | loss scale: 2048.0 | grad norm: 7.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.725 | TFLOPs: 42.28 | +[default7]: iteration 4423/ 6200 | consumed samples: 4529152 | consumed tokens: 9275703296 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708943E+00 | loss scale: 2048.0 | grad norm: 5.737 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.964 | TFLOPs: 42.36 | +[default7]: iteration 4424/ 6200 | consumed samples: 4530176 | consumed tokens: 9277800448 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651639E+00 | loss scale: 2048.0 | grad norm: 5.030 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.012 | TFLOPs: 42.37 | +[default7]: iteration 4425/ 6200 | consumed samples: 4531200 | consumed tokens: 9279897600 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698923E+00 | loss scale: 2048.0 | grad norm: 5.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.050 | TFLOPs: 42.38 | +[default7]: iteration 4426/ 6200 | consumed samples: 4532224 | consumed tokens: 9281994752 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694336E+00 | loss scale: 2048.0 | grad norm: 7.696 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.969 | TFLOPs: 42.36 | +[default7]: iteration 4427/ 6200 | consumed samples: 4533248 | consumed tokens: 9284091904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686542E+00 | loss scale: 2048.0 | grad norm: 6.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.743 | TFLOPs: 42.29 | +[default7]: iteration 4428/ 6200 | consumed samples: 4534272 | consumed tokens: 9286189056 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697840E+00 | loss scale: 2048.0 | grad norm: 6.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.998 | TFLOPs: 42.37 | +[default7]: iteration 4429/ 6200 | consumed samples: 4535296 | consumed tokens: 9288286208 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654610E+00 | loss scale: 2048.0 | grad norm: 5.651 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.581 | TFLOPs: 42.24 | +[default7]: iteration 4430/ 6200 | consumed samples: 4536320 | consumed tokens: 9290383360 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681780E+00 | loss scale: 2048.0 | grad norm: 5.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.653 | TFLOPs: 42.26 | +[default7]: iteration 4431/ 6200 | consumed samples: 4537344 | consumed tokens: 9292480512 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673230E+00 | loss scale: 2048.0 | grad norm: 5.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.549 | TFLOPs: 42.23 | +[default7]: iteration 4432/ 6200 | consumed samples: 4538368 | consumed tokens: 9294577664 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.715240E+00 | loss scale: 2048.0 | grad norm: 5.687 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.548 | TFLOPs: 42.23 | +[default7]: iteration 4433/ 6200 | consumed samples: 4539392 | consumed tokens: 9296674816 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666191E+00 | loss scale: 2048.0 | grad norm: 5.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.548 | TFLOPs: 42.23 | +[default7]: iteration 4434/ 6200 | consumed samples: 4540416 | consumed tokens: 9298771968 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649345E+00 | loss scale: 2048.0 | grad norm: 5.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.774 | TFLOPs: 42.30 | +[default7]: iteration 4435/ 6200 | consumed samples: 4541440 | consumed tokens: 9300869120 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667198E+00 | loss scale: 2048.0 | grad norm: 5.776 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.590 | TFLOPs: 42.24 | +[default7]: iteration 4436/ 6200 | consumed samples: 4542464 | consumed tokens: 9302966272 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.677083E+00 | loss scale: 2048.0 | grad norm: 6.749 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.753 | TFLOPs: 42.29 | +[default7]: iteration 4437/ 6200 | consumed samples: 4543488 | consumed tokens: 9305063424 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668044E+00 | loss scale: 2048.0 | grad norm: 4.967 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.998 | TFLOPs: 42.37 | +[default7]: iteration 4438/ 6200 | consumed samples: 4544512 | consumed tokens: 9307160576 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647099E+00 | loss scale: 2048.0 | grad norm: 4.949 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.820 | TFLOPs: 42.31 | +[default7]: iteration 4439/ 6200 | consumed samples: 4545536 | consumed tokens: 9309257728 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682363E+00 | loss scale: 2048.0 | grad norm: 5.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.709 | TFLOPs: 42.28 | +[default7]: iteration 4440/ 6200 | consumed samples: 4546560 | consumed tokens: 9311354880 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699648E+00 | loss scale: 2048.0 | grad norm: 6.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.880 | TFLOPs: 42.33 | +[default7]: iteration 4441/ 6200 | consumed samples: 4547584 | consumed tokens: 9313452032 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683529E+00 | loss scale: 2048.0 | grad norm: 4.856 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.948 | TFLOPs: 42.35 | +[default7]: iteration 4442/ 6200 | consumed samples: 4548608 | consumed tokens: 9315549184 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699393E+00 | loss scale: 2048.0 | grad norm: 5.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.208 | TFLOPs: 42.43 | +[default7]: iteration 4443/ 6200 | consumed samples: 4549632 | consumed tokens: 9317646336 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661956E+00 | loss scale: 2048.0 | grad norm: 6.037 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.170 | TFLOPs: 42.42 | +[default7]: iteration 4444/ 6200 | consumed samples: 4550656 | consumed tokens: 9319743488 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673654E+00 | loss scale: 2048.0 | grad norm: 5.076 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.965 | TFLOPs: 42.36 | +[default7]: iteration 4445/ 6200 | consumed samples: 4551680 | consumed tokens: 9321840640 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.684886E+00 | loss scale: 2048.0 | grad norm: 5.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.120 | TFLOPs: 42.40 | +[default7]: iteration 4446/ 6200 | consumed samples: 4552704 | consumed tokens: 9323937792 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678571E+00 | loss scale: 2048.0 | grad norm: 5.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.063 | TFLOPs: 42.39 | +[default7]: iteration 4447/ 6200 | consumed samples: 4553728 | consumed tokens: 9326034944 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690281E+00 | loss scale: 2048.0 | grad norm: 4.863 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.760 | TFLOPs: 42.30 | +[default7]: iteration 4448/ 6200 | consumed samples: 4554752 | consumed tokens: 9328132096 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665549E+00 | loss scale: 2048.0 | grad norm: 5.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.495 | TFLOPs: 42.52 | +[default7]: iteration 4449/ 6200 | consumed samples: 4555776 | consumed tokens: 9330229248 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671393E+00 | loss scale: 2048.0 | grad norm: 5.870 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.053 | TFLOPs: 42.38 | +[default7]: iteration 4450/ 6200 | consumed samples: 4556800 | consumed tokens: 9332326400 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.700642E+00 | loss scale: 2048.0 | grad norm: 4.888 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.934 | TFLOPs: 42.35 | +[default7]: iteration 4451/ 6200 | consumed samples: 4557824 | consumed tokens: 9334423552 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690704E+00 | loss scale: 2048.0 | grad norm: 5.760 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.006 | TFLOPs: 42.37 | +[default7]: iteration 4452/ 6200 | consumed samples: 4558848 | consumed tokens: 9336520704 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702087E+00 | loss scale: 2048.0 | grad norm: 5.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.861 | TFLOPs: 42.33 | +[default7]: iteration 4453/ 6200 | consumed samples: 4559872 | consumed tokens: 9338617856 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689831E+00 | loss scale: 2048.0 | grad norm: 7.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.717 | TFLOPs: 42.28 | +[default7]: iteration 4454/ 6200 | consumed samples: 4560896 | consumed tokens: 9340715008 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661563E+00 | loss scale: 2048.0 | grad norm: 4.928 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.635 | TFLOPs: 42.26 | +[default7]: iteration 4455/ 6200 | consumed samples: 4561920 | consumed tokens: 9342812160 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687943E+00 | loss scale: 2048.0 | grad norm: 5.010 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.003 | TFLOPs: 42.37 | +[default7]: iteration 4456/ 6200 | consumed samples: 4562944 | consumed tokens: 9344909312 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663268E+00 | loss scale: 2048.0 | grad norm: 5.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.941 | TFLOPs: 42.35 | +[default7]: iteration 4457/ 6200 | consumed samples: 4563968 | consumed tokens: 9347006464 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679724E+00 | loss scale: 2048.0 | grad norm: 6.032 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.354 | TFLOPs: 42.48 | +[default7]: iteration 4458/ 6200 | consumed samples: 4564992 | consumed tokens: 9349103616 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674049E+00 | loss scale: 2048.0 | grad norm: 6.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.417 | TFLOPs: 42.50 | +[default7]: iteration 4459/ 6200 | consumed samples: 4566016 | consumed tokens: 9351200768 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639231E+00 | loss scale: 2048.0 | grad norm: 5.857 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.351 | TFLOPs: 42.48 | +[default7]: iteration 4460/ 6200 | consumed samples: 4567040 | consumed tokens: 9353297920 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665676E+00 | loss scale: 2048.0 | grad norm: 5.620 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.342 | TFLOPs: 42.47 | +[default7]: iteration 4461/ 6200 | consumed samples: 4568064 | consumed tokens: 9355395072 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676435E+00 | loss scale: 2048.0 | grad norm: 6.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.333 | TFLOPs: 42.47 | +[default7]: iteration 4462/ 6200 | consumed samples: 4569088 | consumed tokens: 9357492224 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666361E+00 | loss scale: 2048.0 | grad norm: 6.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.441 | TFLOPs: 42.50 | +[default7]: iteration 4463/ 6200 | consumed samples: 4570112 | consumed tokens: 9359589376 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676964E+00 | loss scale: 2048.0 | grad norm: 5.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.351 | TFLOPs: 42.48 | +[default7]: iteration 4464/ 6200 | consumed samples: 4571136 | consumed tokens: 9361686528 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654825E+00 | loss scale: 2048.0 | grad norm: 5.785 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.790 | TFLOPs: 42.30 | +[default7]: iteration 4465/ 6200 | consumed samples: 4572160 | consumed tokens: 9363783680 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682099E+00 | loss scale: 2048.0 | grad norm: 4.854 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.642 | TFLOPs: 42.26 | +[default7]: iteration 4466/ 6200 | consumed samples: 4573184 | consumed tokens: 9365880832 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658035E+00 | loss scale: 2048.0 | grad norm: 5.979 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.654 | TFLOPs: 42.26 | +[default7]: iteration 4467/ 6200 | consumed samples: 4574208 | consumed tokens: 9367977984 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698195E+00 | loss scale: 2048.0 | grad norm: 5.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.562 | TFLOPs: 42.23 | +[default7]: iteration 4468/ 6200 | consumed samples: 4575232 | consumed tokens: 9370075136 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667692E+00 | loss scale: 2048.0 | grad norm: 5.690 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.617 | TFLOPs: 42.25 | +[default7]: iteration 4469/ 6200 | consumed samples: 4576256 | consumed tokens: 9372172288 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691390E+00 | loss scale: 2048.0 | grad norm: 4.869 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.570 | TFLOPs: 42.24 | +[default7]: iteration 4470/ 6200 | consumed samples: 4577280 | consumed tokens: 9374269440 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680823E+00 | loss scale: 2048.0 | grad norm: 5.054 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.510 | TFLOPs: 42.22 | +[default7]: iteration 4471/ 6200 | consumed samples: 4578304 | consumed tokens: 9376366592 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699747E+00 | loss scale: 2048.0 | grad norm: 5.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.308 | TFLOPs: 42.16 | +[default7]: iteration 4472/ 6200 | consumed samples: 4579328 | consumed tokens: 9378463744 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648978E+00 | loss scale: 2048.0 | grad norm: 5.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.701 | TFLOPs: 42.28 | +[default7]: iteration 4473/ 6200 | consumed samples: 4580352 | consumed tokens: 9380560896 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658621E+00 | loss scale: 2048.0 | grad norm: 4.857 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.688 | TFLOPs: 42.27 | +[default7]: iteration 4474/ 6200 | consumed samples: 4581376 | consumed tokens: 9382658048 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656183E+00 | loss scale: 2048.0 | grad norm: 5.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.729 | TFLOPs: 42.29 | +[default7]: iteration 4475/ 6200 | consumed samples: 4582400 | consumed tokens: 9384755200 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662367E+00 | loss scale: 2048.0 | grad norm: 4.660 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.767 | TFLOPs: 42.30 | +[default7]: iteration 4476/ 6200 | consumed samples: 4583424 | consumed tokens: 9386852352 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676224E+00 | loss scale: 2048.0 | grad norm: 5.908 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.772 | TFLOPs: 42.30 | +[default7]: iteration 4477/ 6200 | consumed samples: 4584448 | consumed tokens: 9388949504 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667474E+00 | loss scale: 2048.0 | grad norm: 5.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.468 | TFLOPs: 42.21 | +[default7]: iteration 4478/ 6200 | consumed samples: 4585472 | consumed tokens: 9391046656 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688233E+00 | loss scale: 2048.0 | grad norm: 5.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.834 | TFLOPs: 42.32 | +[default7]: iteration 4479/ 6200 | consumed samples: 4586496 | consumed tokens: 9393143808 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678983E+00 | loss scale: 2048.0 | grad norm: 4.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.860 | TFLOPs: 42.33 | +[default7]: iteration 4480/ 6200 | consumed samples: 4587520 | consumed tokens: 9395240960 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668120E+00 | loss scale: 2048.0 | grad norm: 6.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.077 | TFLOPs: 42.39 | +[default7]: iteration 4481/ 6200 | consumed samples: 4588544 | consumed tokens: 9397338112 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652727E+00 | loss scale: 2048.0 | grad norm: 5.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.957 | TFLOPs: 42.36 | +[default7]: iteration 4482/ 6200 | consumed samples: 4589568 | consumed tokens: 9399435264 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.677714E+00 | loss scale: 2048.0 | grad norm: 6.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.612 | TFLOPs: 42.25 | +[default7]: iteration 4483/ 6200 | consumed samples: 4590592 | consumed tokens: 9401532416 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656307E+00 | loss scale: 2048.0 | grad norm: 6.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.671 | TFLOPs: 42.27 | +[default7]: iteration 4484/ 6200 | consumed samples: 4591616 | consumed tokens: 9403629568 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639139E+00 | loss scale: 2048.0 | grad norm: 5.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.834 | TFLOPs: 42.32 | +[default7]: iteration 4485/ 6200 | consumed samples: 4592640 | consumed tokens: 9405726720 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670497E+00 | loss scale: 2048.0 | grad norm: 6.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.634 | TFLOPs: 42.26 | +[default7]: iteration 4486/ 6200 | consumed samples: 4593664 | consumed tokens: 9407823872 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679539E+00 | loss scale: 2048.0 | grad norm: 5.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.633 | TFLOPs: 42.26 | +[default7]: iteration 4487/ 6200 | consumed samples: 4594688 | consumed tokens: 9409921024 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668499E+00 | loss scale: 2048.0 | grad norm: 4.894 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.404 | TFLOPs: 41.88 | +[default7]: iteration 4488/ 6200 | consumed samples: 4595712 | consumed tokens: 9412018176 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667443E+00 | loss scale: 2048.0 | grad norm: 5.698 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.455 | TFLOPs: 42.20 | +[default7]: iteration 4489/ 6200 | consumed samples: 4596736 | consumed tokens: 9414115328 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687647E+00 | loss scale: 2048.0 | grad norm: 5.911 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.554 | TFLOPs: 42.23 | +[default7]: iteration 4490/ 6200 | consumed samples: 4597760 | consumed tokens: 9416212480 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694440E+00 | loss scale: 2048.0 | grad norm: 4.855 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.385 | TFLOPs: 42.18 | +[default7]: iteration 4491/ 6200 | consumed samples: 4598784 | consumed tokens: 9418309632 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663987E+00 | loss scale: 2048.0 | grad norm: 5.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.722 | TFLOPs: 42.28 | +[default7]: iteration 4492/ 6200 | consumed samples: 4599808 | consumed tokens: 9420406784 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691188E+00 | loss scale: 2048.0 | grad norm: 6.979 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.186 | TFLOPs: 42.12 | +[default7]: iteration 4493/ 6200 | consumed samples: 4600832 | consumed tokens: 9422503936 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648596E+00 | loss scale: 2048.0 | grad norm: 5.856 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.214 | TFLOPs: 42.13 | +[default7]: iteration 4494/ 6200 | consumed samples: 4601856 | consumed tokens: 9424601088 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657373E+00 | loss scale: 2048.0 | grad norm: 5.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.652 | TFLOPs: 42.26 | +[default7]: iteration 4495/ 6200 | consumed samples: 4602880 | consumed tokens: 9426698240 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645741E+00 | loss scale: 2048.0 | grad norm: 5.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.357 | TFLOPs: 42.17 | +[default7]: iteration 4496/ 6200 | consumed samples: 4603904 | consumed tokens: 9428795392 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708580E+00 | loss scale: 2048.0 | grad norm: 7.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.582 | TFLOPs: 42.24 | +[default7]: iteration 4497/ 6200 | consumed samples: 4604928 | consumed tokens: 9430892544 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690191E+00 | loss scale: 2048.0 | grad norm: 5.959 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.963 | TFLOPs: 42.36 | +[default7]: iteration 4498/ 6200 | consumed samples: 4605952 | consumed tokens: 9432989696 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690455E+00 | loss scale: 2048.0 | grad norm: 6.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.919 | TFLOPs: 42.34 | +[default7]: iteration 4499/ 6200 | consumed samples: 4606976 | consumed tokens: 9435086848 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.677984E+00 | loss scale: 2048.0 | grad norm: 5.043 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.647 | TFLOPs: 42.26 | +[default7]: iteration 4500/ 6200 | consumed samples: 4608000 | consumed tokens: 9437184000 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656143E+00 | loss scale: 2048.0 | grad norm: 5.700 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.642 | TFLOPs: 42.26 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 4500 | lm loss value: 3.630320E+00 | lm loss PPL: 3.772490E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 4500 | lm loss value: 1.570621E+00 | lm loss PPL: 4.809634E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 4500 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 22:24:26,111] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4500 is begin to save! +[default0]:[2022-10-06 22:24:26,115] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,546] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,573] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,601] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,630] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,713] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,742] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,797] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,824] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,852] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,910] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,966] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:26,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:26,994] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:27,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:27,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:27,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:27,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:27,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:27,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:27,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:27,105] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:27,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:27,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:27,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:27,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 22:24:27,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:27,196] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 22:24:27,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 22:24:27,198] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/mp_rank_00_model_states.pt +[default0]:[2022-10-06 22:24:27,198] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 22:24:27,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 22:24:27,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 22:24:27,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 22:24:27,391] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 22:24:27,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 22:24:27,398] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 22:24:27,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 22:24:27,401] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 22:24:27,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 22:24:27,397] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 22:24:27,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 22:24:27,452] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 22:24:27,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 22:24:27,385] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 22:24:27,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 22:24:27,483] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 22:24:27,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 22:24:27,442] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 22:24:27,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 22:24:27,505] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 22:24:27,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 22:24:27,503] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 22:24:27,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 22:24:27,503] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 22:24:27,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 22:24:27,530] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 22:24:27,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 22:24:27,526] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 22:24:27,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 22:24:27,467] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 22:24:27,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 22:24:27,511] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 22:24:27,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 22:24:27,532] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 22:24:27,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 22:24:27,559] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 22:24:27,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 22:24:27,532] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 22:24:27,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 22:24:27,504] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 22:24:27,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 22:24:27,512] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 22:24:27,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 22:24:27,522] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 22:24:27,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 22:24:27,493] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 22:24:27,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 22:24:27,514] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 22:24:27,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 22:24:27,534] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 22:24:27,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 22:24:27,585] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 22:24:27,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 22:24:27,531] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 22:24:27,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 22:24:27,515] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 22:24:27,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 22:24:27,536] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 22:24:27,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 22:24:27,533] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 22:24:27,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 22:24:27,590] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default7]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default4]:[2022-10-06 22:24:27,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 22:24:27,573] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default7]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default7]:time (ms) | save-checkpoint: 1526.71 +[default5]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default1]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default4]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default6]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default3]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default5]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default5]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default3]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default3]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default0]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default6]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default0]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default0]: successfully saved checkpoint at iteration 4500 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default2]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default0]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default6]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default5]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default2]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default4]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default7]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default3]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default1]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default7]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default1]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default0]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 22:24:27,636] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4500/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default2]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default2]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default1]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default4]:[2022-10-06 22:24:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4500 is ready now! +[default7]: iteration 4501/ 6200 | consumed samples: 4609024 | consumed tokens: 9439281152 | elapsed time per iteration (s): 53.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666706E+00 | loss scale: 2048.0 | grad norm: 6.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.181 | TFLOPs: 5.85 | +[default7]: iteration 4502/ 6200 | consumed samples: 4610048 | consumed tokens: 9441378304 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689284E+00 | loss scale: 2048.0 | grad norm: 7.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.755 | TFLOPs: 42.29 | +[default7]: iteration 4503/ 6200 | consumed samples: 4611072 | consumed tokens: 9443475456 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644881E+00 | loss scale: 2048.0 | grad norm: 5.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.645 | TFLOPs: 42.26 | +[default7]: iteration 4504/ 6200 | consumed samples: 4612096 | consumed tokens: 9445572608 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.702640E+00 | loss scale: 2048.0 | grad norm: 6.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.342 | TFLOPs: 42.17 | +[default7]: iteration 4505/ 6200 | consumed samples: 4613120 | consumed tokens: 9447669760 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668916E+00 | loss scale: 2048.0 | grad norm: 5.692 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.556 | TFLOPs: 42.23 | +[default7]: iteration 4506/ 6200 | consumed samples: 4614144 | consumed tokens: 9449766912 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686926E+00 | loss scale: 2048.0 | grad norm: 5.754 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.263 | TFLOPs: 42.14 | +[default7]: iteration 4507/ 6200 | consumed samples: 4615168 | consumed tokens: 9451864064 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705227E+00 | loss scale: 2048.0 | grad norm: 5.842 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.238 | TFLOPs: 42.14 | +[default7]: iteration 4508/ 6200 | consumed samples: 4616192 | consumed tokens: 9453961216 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670745E+00 | loss scale: 2048.0 | grad norm: 6.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.417 | TFLOPs: 42.19 | +[default7]: iteration 4509/ 6200 | consumed samples: 4617216 | consumed tokens: 9456058368 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668109E+00 | loss scale: 2048.0 | grad norm: 5.804 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.755 | TFLOPs: 42.29 | +[default7]: iteration 4510/ 6200 | consumed samples: 4618240 | consumed tokens: 9458155520 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682027E+00 | loss scale: 2048.0 | grad norm: 5.842 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.531 | TFLOPs: 42.23 | +[default7]: iteration 4511/ 6200 | consumed samples: 4619264 | consumed tokens: 9460252672 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693184E+00 | loss scale: 2048.0 | grad norm: 6.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.275 | TFLOPs: 42.15 | +[default7]: iteration 4512/ 6200 | consumed samples: 4620288 | consumed tokens: 9462349824 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671115E+00 | loss scale: 2048.0 | grad norm: 5.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.540 | TFLOPs: 42.23 | +[default7]: iteration 4513/ 6200 | consumed samples: 4621312 | consumed tokens: 9464446976 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651620E+00 | loss scale: 2048.0 | grad norm: 4.966 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.456 | TFLOPs: 42.20 | +[default7]: iteration 4514/ 6200 | consumed samples: 4622336 | consumed tokens: 9466544128 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683830E+00 | loss scale: 2048.0 | grad norm: 4.830 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.382 | TFLOPs: 42.18 | +[default7]: iteration 4515/ 6200 | consumed samples: 4623360 | consumed tokens: 9468641280 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682496E+00 | loss scale: 2048.0 | grad norm: 4.822 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.731 | TFLOPs: 42.29 | +[default7]: iteration 4516/ 6200 | consumed samples: 4624384 | consumed tokens: 9470738432 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662438E+00 | loss scale: 2048.0 | grad norm: 5.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.568 | TFLOPs: 42.24 | +[default7]: iteration 4517/ 6200 | consumed samples: 4625408 | consumed tokens: 9472835584 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648575E+00 | loss scale: 2048.0 | grad norm: 6.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.752 | TFLOPs: 42.29 | +[default7]: iteration 4518/ 6200 | consumed samples: 4626432 | consumed tokens: 9474932736 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689309E+00 | loss scale: 2048.0 | grad norm: 6.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.930 | TFLOPs: 42.35 | +[default7]: iteration 4519/ 6200 | consumed samples: 4627456 | consumed tokens: 9477029888 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683881E+00 | loss scale: 2048.0 | grad norm: 4.985 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.613 | TFLOPs: 42.25 | +[default7]: iteration 4520/ 6200 | consumed samples: 4628480 | consumed tokens: 9479127040 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.701844E+00 | loss scale: 2048.0 | grad norm: 5.023 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.824 | TFLOPs: 42.31 | +[default7]: iteration 4521/ 6200 | consumed samples: 4629504 | consumed tokens: 9481224192 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667768E+00 | loss scale: 2048.0 | grad norm: 5.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.642 | TFLOPs: 42.26 | +[default7]: iteration 4522/ 6200 | consumed samples: 4630528 | consumed tokens: 9483321344 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.664247E+00 | loss scale: 2048.0 | grad norm: 6.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.455 | TFLOPs: 42.20 | +[default7]: iteration 4523/ 6200 | consumed samples: 4631552 | consumed tokens: 9485418496 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.655553E+00 | loss scale: 2048.0 | grad norm: 4.588 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.396 | TFLOPs: 42.18 | +[default7]: iteration 4524/ 6200 | consumed samples: 4632576 | consumed tokens: 9487515648 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668869E+00 | loss scale: 2048.0 | grad norm: 5.857 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.461 | TFLOPs: 42.20 | +[default7]: iteration 4525/ 6200 | consumed samples: 4633600 | consumed tokens: 9489612800 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689887E+00 | loss scale: 2048.0 | grad norm: 4.784 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.433 | TFLOPs: 42.20 | +[default7]: iteration 4526/ 6200 | consumed samples: 4634624 | consumed tokens: 9491709952 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666828E+00 | loss scale: 2048.0 | grad norm: 5.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.510 | TFLOPs: 42.22 | +[default7]: iteration 4527/ 6200 | consumed samples: 4635648 | consumed tokens: 9493807104 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680978E+00 | loss scale: 2048.0 | grad norm: 7.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.420 | TFLOPs: 42.19 | +[default7]: iteration 4528/ 6200 | consumed samples: 4636672 | consumed tokens: 9495904256 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683478E+00 | loss scale: 2048.0 | grad norm: 6.629 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.473 | TFLOPs: 42.21 | +[default7]: iteration 4529/ 6200 | consumed samples: 4637696 | consumed tokens: 9498001408 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681067E+00 | loss scale: 2048.0 | grad norm: 6.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.415 | TFLOPs: 42.19 | +[default7]: iteration 4530/ 6200 | consumed samples: 4638720 | consumed tokens: 9500098560 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671063E+00 | loss scale: 2048.0 | grad norm: 5.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.550 | TFLOPs: 42.23 | +[default7]: iteration 4531/ 6200 | consumed samples: 4639744 | consumed tokens: 9502195712 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660988E+00 | loss scale: 2048.0 | grad norm: 7.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.627 | TFLOPs: 42.25 | +[default7]: iteration 4532/ 6200 | consumed samples: 4640768 | consumed tokens: 9504292864 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698111E+00 | loss scale: 2048.0 | grad norm: 7.953 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.654 | TFLOPs: 42.26 | +[default7]: iteration 4533/ 6200 | consumed samples: 4641792 | consumed tokens: 9506390016 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675216E+00 | loss scale: 2048.0 | grad norm: 6.670 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.633 | TFLOPs: 42.26 | +[default7]: iteration 4534/ 6200 | consumed samples: 4642816 | consumed tokens: 9508487168 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648880E+00 | loss scale: 2048.0 | grad norm: 5.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.388 | TFLOPs: 42.18 | +[default7]: iteration 4535/ 6200 | consumed samples: 4643840 | consumed tokens: 9510584320 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648473E+00 | loss scale: 2048.0 | grad norm: 6.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.055 | TFLOPs: 42.08 | +[default7]: iteration 4536/ 6200 | consumed samples: 4644864 | consumed tokens: 9512681472 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647295E+00 | loss scale: 2048.0 | grad norm: 9.625 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.421 | TFLOPs: 42.19 | +[default7]: iteration 4537/ 6200 | consumed samples: 4645888 | consumed tokens: 9514778624 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692217E+00 | loss scale: 2048.0 | grad norm: 6.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.358 | TFLOPs: 42.17 | +[default7]: iteration 4538/ 6200 | consumed samples: 4646912 | consumed tokens: 9516875776 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674246E+00 | loss scale: 2048.0 | grad norm: 5.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.480 | TFLOPs: 42.21 | +[default7]: iteration 4539/ 6200 | consumed samples: 4647936 | consumed tokens: 9518972928 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665997E+00 | loss scale: 2048.0 | grad norm: 5.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.305 | TFLOPs: 42.16 | +[default7]: iteration 4540/ 6200 | consumed samples: 4648960 | consumed tokens: 9521070080 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660307E+00 | loss scale: 2048.0 | grad norm: 6.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.786 | TFLOPs: 42.30 | +[default7]: iteration 4541/ 6200 | consumed samples: 4649984 | consumed tokens: 9523167232 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667586E+00 | loss scale: 2048.0 | grad norm: 4.619 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.861 | TFLOPs: 42.33 | +[default7]: iteration 4542/ 6200 | consumed samples: 4651008 | consumed tokens: 9525264384 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669817E+00 | loss scale: 2048.0 | grad norm: 5.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.832 | TFLOPs: 42.32 | +[default7]: iteration 4543/ 6200 | consumed samples: 4652032 | consumed tokens: 9527361536 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663056E+00 | loss scale: 2048.0 | grad norm: 5.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.529 | TFLOPs: 42.22 | +[default7]: iteration 4544/ 6200 | consumed samples: 4653056 | consumed tokens: 9529458688 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697144E+00 | loss scale: 2048.0 | grad norm: 5.627 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.817 | TFLOPs: 42.31 | +[default7]: iteration 4545/ 6200 | consumed samples: 4654080 | consumed tokens: 9531555840 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666785E+00 | loss scale: 2048.0 | grad norm: 4.959 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.587 | TFLOPs: 42.24 | +[default7]: iteration 4546/ 6200 | consumed samples: 4655104 | consumed tokens: 9533652992 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678980E+00 | loss scale: 2048.0 | grad norm: 4.811 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.490 | TFLOPs: 42.21 | +[default7]: iteration 4547/ 6200 | consumed samples: 4656128 | consumed tokens: 9535750144 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.707650E+00 | loss scale: 2048.0 | grad norm: 6.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.660 | TFLOPs: 42.26 | +[default7]: iteration 4548/ 6200 | consumed samples: 4657152 | consumed tokens: 9537847296 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668311E+00 | loss scale: 2048.0 | grad norm: 5.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.625 | TFLOPs: 42.25 | +[default7]: iteration 4549/ 6200 | consumed samples: 4658176 | consumed tokens: 9539944448 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660619E+00 | loss scale: 2048.0 | grad norm: 6.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.628 | TFLOPs: 42.26 | +[default7]: iteration 4550/ 6200 | consumed samples: 4659200 | consumed tokens: 9542041600 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672856E+00 | loss scale: 2048.0 | grad norm: 5.886 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.849 | TFLOPs: 42.32 | +[default7]: iteration 4551/ 6200 | consumed samples: 4660224 | consumed tokens: 9544138752 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662834E+00 | loss scale: 2048.0 | grad norm: 6.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.742 | TFLOPs: 42.29 | +[default7]: iteration 4552/ 6200 | consumed samples: 4661248 | consumed tokens: 9546235904 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672009E+00 | loss scale: 2048.0 | grad norm: 5.942 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.549 | TFLOPs: 42.23 | +[default7]: iteration 4553/ 6200 | consumed samples: 4662272 | consumed tokens: 9548333056 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.677698E+00 | loss scale: 2048.0 | grad norm: 5.686 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.592 | TFLOPs: 42.24 | +[default7]: iteration 4554/ 6200 | consumed samples: 4663296 | consumed tokens: 9550430208 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.703433E+00 | loss scale: 2048.0 | grad norm: 4.855 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.047 | TFLOPs: 42.38 | +[default7]: iteration 4555/ 6200 | consumed samples: 4664320 | consumed tokens: 9552527360 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.723078E+00 | loss scale: 2048.0 | grad norm: 5.737 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.096 | TFLOPs: 42.40 | +[default7]: iteration 4556/ 6200 | consumed samples: 4665344 | consumed tokens: 9554624512 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683265E+00 | loss scale: 2048.0 | grad norm: 6.019 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.926 | TFLOPs: 42.35 | +[default7]: iteration 4557/ 6200 | consumed samples: 4666368 | consumed tokens: 9556721664 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686879E+00 | loss scale: 2048.0 | grad norm: 5.776 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.042 | TFLOPs: 42.38 | +[default7]: iteration 4558/ 6200 | consumed samples: 4667392 | consumed tokens: 9558818816 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678451E+00 | loss scale: 2048.0 | grad norm: 6.025 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.800 | TFLOPs: 42.31 | +[default7]: iteration 4559/ 6200 | consumed samples: 4668416 | consumed tokens: 9560915968 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693939E+00 | loss scale: 2048.0 | grad norm: 5.944 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.916 | TFLOPs: 42.34 | +[default7]: iteration 4560/ 6200 | consumed samples: 4669440 | consumed tokens: 9563013120 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657549E+00 | loss scale: 2048.0 | grad norm: 5.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.969 | TFLOPs: 42.36 | +[default7]: iteration 4561/ 6200 | consumed samples: 4670464 | consumed tokens: 9565110272 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690998E+00 | loss scale: 2048.0 | grad norm: 5.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.269 | TFLOPs: 42.15 | +[default7]: iteration 4562/ 6200 | consumed samples: 4671488 | consumed tokens: 9567207424 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660332E+00 | loss scale: 2048.0 | grad norm: 4.843 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.323 | TFLOPs: 42.16 | +[default7]: iteration 4563/ 6200 | consumed samples: 4672512 | consumed tokens: 9569304576 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671915E+00 | loss scale: 2048.0 | grad norm: 5.955 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.548 | TFLOPs: 42.23 | +[default7]: iteration 4564/ 6200 | consumed samples: 4673536 | consumed tokens: 9571401728 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692896E+00 | loss scale: 2048.0 | grad norm: 5.569 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.263 | TFLOPs: 42.14 | +[default7]: iteration 4565/ 6200 | consumed samples: 4674560 | consumed tokens: 9573498880 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.677314E+00 | loss scale: 2048.0 | grad norm: 5.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.485 | TFLOPs: 42.21 | +[default7]: iteration 4566/ 6200 | consumed samples: 4675584 | consumed tokens: 9575596032 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686264E+00 | loss scale: 2048.0 | grad norm: 5.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.410 | TFLOPs: 42.19 | +[default7]: iteration 4567/ 6200 | consumed samples: 4676608 | consumed tokens: 9577693184 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696404E+00 | loss scale: 2048.0 | grad norm: 5.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.261 | TFLOPs: 42.14 | +[default7]: iteration 4568/ 6200 | consumed samples: 4677632 | consumed tokens: 9579790336 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672497E+00 | loss scale: 2048.0 | grad norm: 4.960 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.604 | TFLOPs: 42.25 | +[default7]: iteration 4569/ 6200 | consumed samples: 4678656 | consumed tokens: 9581887488 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.677522E+00 | loss scale: 2048.0 | grad norm: 5.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.724 | TFLOPs: 42.28 | +[default7]: iteration 4570/ 6200 | consumed samples: 4679680 | consumed tokens: 9583984640 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681365E+00 | loss scale: 2048.0 | grad norm: 5.680 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.456 | TFLOPs: 42.20 | +[default7]: iteration 4571/ 6200 | consumed samples: 4680704 | consumed tokens: 9586081792 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693140E+00 | loss scale: 2048.0 | grad norm: 5.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.480 | TFLOPs: 42.21 | +[default7]: iteration 4572/ 6200 | consumed samples: 4681728 | consumed tokens: 9588178944 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668614E+00 | loss scale: 2048.0 | grad norm: 5.740 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.568 | TFLOPs: 42.24 | +[default7]: iteration 4573/ 6200 | consumed samples: 4682752 | consumed tokens: 9590276096 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676419E+00 | loss scale: 2048.0 | grad norm: 4.939 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.166 | TFLOPs: 42.11 | +[default7]: iteration 4574/ 6200 | consumed samples: 4683776 | consumed tokens: 9592373248 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.684791E+00 | loss scale: 2048.0 | grad norm: 4.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.682 | TFLOPs: 42.27 | +[default7]: iteration 4575/ 6200 | consumed samples: 4684800 | consumed tokens: 9594470400 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662351E+00 | loss scale: 2048.0 | grad norm: 5.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.521 | TFLOPs: 42.22 | +[default7]: iteration 4576/ 6200 | consumed samples: 4685824 | consumed tokens: 9596567552 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630146E+00 | loss scale: 2048.0 | grad norm: 6.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.636 | TFLOPs: 42.26 | +[default7]: iteration 4577/ 6200 | consumed samples: 4686848 | consumed tokens: 9598664704 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650070E+00 | loss scale: 2048.0 | grad norm: 4.629 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.649 | TFLOPs: 42.26 | +[default7]: iteration 4578/ 6200 | consumed samples: 4687872 | consumed tokens: 9600761856 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678895E+00 | loss scale: 2048.0 | grad norm: 5.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.587 | TFLOPs: 42.24 | +[default7]: iteration 4579/ 6200 | consumed samples: 4688896 | consumed tokens: 9602859008 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682857E+00 | loss scale: 2048.0 | grad norm: 5.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.380 | TFLOPs: 42.18 | +[default7]: iteration 4580/ 6200 | consumed samples: 4689920 | consumed tokens: 9604956160 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673198E+00 | loss scale: 2048.0 | grad norm: 5.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.779 | TFLOPs: 42.30 | +[default7]: iteration 4581/ 6200 | consumed samples: 4690944 | consumed tokens: 9607053312 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668124E+00 | loss scale: 2048.0 | grad norm: 5.010 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.391 | TFLOPs: 42.18 | +[default7]: iteration 4582/ 6200 | consumed samples: 4691968 | consumed tokens: 9609150464 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691631E+00 | loss scale: 2048.0 | grad norm: 5.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.278 | TFLOPs: 42.15 | +[default7]: iteration 4583/ 6200 | consumed samples: 4692992 | consumed tokens: 9611247616 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667346E+00 | loss scale: 2048.0 | grad norm: 6.732 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.652 | TFLOPs: 42.26 | +[default7]: iteration 4584/ 6200 | consumed samples: 4694016 | consumed tokens: 9613344768 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.691298E+00 | loss scale: 2048.0 | grad norm: 5.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.569 | TFLOPs: 42.24 | +[default7]: iteration 4585/ 6200 | consumed samples: 4695040 | consumed tokens: 9615441920 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690399E+00 | loss scale: 2048.0 | grad norm: 4.728 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.625 | TFLOPs: 42.25 | +[default7]: iteration 4586/ 6200 | consumed samples: 4696064 | consumed tokens: 9617539072 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639013E+00 | loss scale: 2048.0 | grad norm: 6.857 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.729 | TFLOPs: 42.29 | +[default7]: iteration 4587/ 6200 | consumed samples: 4697088 | consumed tokens: 9619636224 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683756E+00 | loss scale: 2048.0 | grad norm: 6.993 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.878 | TFLOPs: 42.33 | +[default7]: iteration 4588/ 6200 | consumed samples: 4698112 | consumed tokens: 9621733376 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666820E+00 | loss scale: 2048.0 | grad norm: 5.024 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.805 | TFLOPs: 42.31 | +[default7]: iteration 4589/ 6200 | consumed samples: 4699136 | consumed tokens: 9623830528 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652601E+00 | loss scale: 2048.0 | grad norm: 4.803 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.066 | TFLOPs: 42.39 | +[default7]: iteration 4590/ 6200 | consumed samples: 4700160 | consumed tokens: 9625927680 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697819E+00 | loss scale: 2048.0 | grad norm: 5.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.224 | TFLOPs: 42.44 | +[default7]: iteration 4591/ 6200 | consumed samples: 4701184 | consumed tokens: 9628024832 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649277E+00 | loss scale: 2048.0 | grad norm: 5.924 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.084 | TFLOPs: 42.39 | +[default7]: iteration 4592/ 6200 | consumed samples: 4702208 | consumed tokens: 9630121984 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.692878E+00 | loss scale: 2048.0 | grad norm: 5.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.264 | TFLOPs: 42.45 | +[default7]: iteration 4593/ 6200 | consumed samples: 4703232 | consumed tokens: 9632219136 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671001E+00 | loss scale: 2048.0 | grad norm: 5.675 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.174 | TFLOPs: 42.42 | +[default7]: iteration 4594/ 6200 | consumed samples: 4704256 | consumed tokens: 9634316288 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657851E+00 | loss scale: 2048.0 | grad norm: 5.955 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.318 | TFLOPs: 42.47 | +[default7]: iteration 4595/ 6200 | consumed samples: 4705280 | consumed tokens: 9636413440 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708924E+00 | loss scale: 2048.0 | grad norm: 6.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.468 | TFLOPs: 42.51 | +[default7]: iteration 4596/ 6200 | consumed samples: 4706304 | consumed tokens: 9638510592 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699661E+00 | loss scale: 2048.0 | grad norm: 5.914 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.142 | TFLOPs: 42.41 | +[default7]: iteration 4597/ 6200 | consumed samples: 4707328 | consumed tokens: 9640607744 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682899E+00 | loss scale: 2048.0 | grad norm: 4.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.998 | TFLOPs: 42.37 | +[default7]: iteration 4598/ 6200 | consumed samples: 4708352 | consumed tokens: 9642704896 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685268E+00 | loss scale: 2048.0 | grad norm: 4.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.839 | TFLOPs: 42.32 | +[default7]: iteration 4599/ 6200 | consumed samples: 4709376 | consumed tokens: 9644802048 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666932E+00 | loss scale: 2048.0 | grad norm: 5.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.999 | TFLOPs: 42.37 | +[default7]: iteration 4600/ 6200 | consumed samples: 4710400 | consumed tokens: 9646899200 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671980E+00 | loss scale: 2048.0 | grad norm: 5.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.984 | TFLOPs: 42.36 | +[default7]: iteration 4601/ 6200 | consumed samples: 4711424 | consumed tokens: 9648996352 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641348E+00 | loss scale: 2048.0 | grad norm: 5.878 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.934 | TFLOPs: 42.35 | +[default7]: iteration 4602/ 6200 | consumed samples: 4712448 | consumed tokens: 9651093504 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661011E+00 | loss scale: 2048.0 | grad norm: 4.791 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.034 | TFLOPs: 42.38 | +[default7]: iteration 4603/ 6200 | consumed samples: 4713472 | consumed tokens: 9653190656 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653661E+00 | loss scale: 2048.0 | grad norm: 5.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.748 | TFLOPs: 42.29 | +[default7]: iteration 4604/ 6200 | consumed samples: 4714496 | consumed tokens: 9655287808 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679630E+00 | loss scale: 2048.0 | grad norm: 4.887 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.397 | TFLOPs: 42.18 | +[default7]: iteration 4605/ 6200 | consumed samples: 4715520 | consumed tokens: 9657384960 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654260E+00 | loss scale: 2048.0 | grad norm: 5.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.919 | TFLOPs: 42.34 | +[default7]: iteration 4606/ 6200 | consumed samples: 4716544 | consumed tokens: 9659482112 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663919E+00 | loss scale: 2048.0 | grad norm: 5.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.716 | TFLOPs: 42.28 | +[default7]: iteration 4607/ 6200 | consumed samples: 4717568 | consumed tokens: 9661579264 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665776E+00 | loss scale: 2048.0 | grad norm: 4.985 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.644 | TFLOPs: 42.26 | +[default7]: iteration 4608/ 6200 | consumed samples: 4718592 | consumed tokens: 9663676416 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665964E+00 | loss scale: 2048.0 | grad norm: 5.953 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.779 | TFLOPs: 42.30 | +[default7]: iteration 4609/ 6200 | consumed samples: 4719616 | consumed tokens: 9665773568 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654204E+00 | loss scale: 2048.0 | grad norm: 5.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.797 | TFLOPs: 42.31 | +[default7]: iteration 4610/ 6200 | consumed samples: 4720640 | consumed tokens: 9667870720 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697188E+00 | loss scale: 2048.0 | grad norm: 6.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.865 | TFLOPs: 42.33 | +[default7]: iteration 4611/ 6200 | consumed samples: 4721664 | consumed tokens: 9669967872 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663305E+00 | loss scale: 2048.0 | grad norm: 5.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.705 | TFLOPs: 42.28 | +[default7]: iteration 4612/ 6200 | consumed samples: 4722688 | consumed tokens: 9672065024 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662663E+00 | loss scale: 2048.0 | grad norm: 5.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.962 | TFLOPs: 42.36 | +[default7]: iteration 4613/ 6200 | consumed samples: 4723712 | consumed tokens: 9674162176 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675112E+00 | loss scale: 2048.0 | grad norm: 5.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.905 | TFLOPs: 42.34 | +[default7]: iteration 4614/ 6200 | consumed samples: 4724736 | consumed tokens: 9676259328 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672285E+00 | loss scale: 2048.0 | grad norm: 5.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.010 | TFLOPs: 42.37 | +[default7]: iteration 4615/ 6200 | consumed samples: 4725760 | consumed tokens: 9678356480 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.714248E+00 | loss scale: 2048.0 | grad norm: 4.996 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.807 | TFLOPs: 42.31 | +[default7]: iteration 4616/ 6200 | consumed samples: 4726784 | consumed tokens: 9680453632 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674499E+00 | loss scale: 2048.0 | grad norm: 5.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.874 | TFLOPs: 42.33 | +[default7]: iteration 4617/ 6200 | consumed samples: 4727808 | consumed tokens: 9682550784 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667477E+00 | loss scale: 2048.0 | grad norm: 5.687 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.855 | TFLOPs: 42.32 | +[default7]: iteration 4618/ 6200 | consumed samples: 4728832 | consumed tokens: 9684647936 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659099E+00 | loss scale: 2048.0 | grad norm: 4.933 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.844 | TFLOPs: 42.32 | +[default7]: iteration 4619/ 6200 | consumed samples: 4729856 | consumed tokens: 9686745088 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695890E+00 | loss scale: 2048.0 | grad norm: 5.619 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.868 | TFLOPs: 42.33 | +[default7]: iteration 4620/ 6200 | consumed samples: 4730880 | consumed tokens: 9688842240 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682224E+00 | loss scale: 2048.0 | grad norm: 6.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.784 | TFLOPs: 42.30 | +[default7]: iteration 4621/ 6200 | consumed samples: 4731904 | consumed tokens: 9690939392 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648804E+00 | loss scale: 2048.0 | grad norm: 4.752 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.794 | TFLOPs: 42.31 | +[default7]: iteration 4622/ 6200 | consumed samples: 4732928 | consumed tokens: 9693036544 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678188E+00 | loss scale: 2048.0 | grad norm: 7.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.024 | TFLOPs: 42.38 | +[default7]: iteration 4623/ 6200 | consumed samples: 4733952 | consumed tokens: 9695133696 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685502E+00 | loss scale: 2048.0 | grad norm: 5.003 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.136 | TFLOPs: 42.41 | +[default7]: iteration 4624/ 6200 | consumed samples: 4734976 | consumed tokens: 9697230848 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656016E+00 | loss scale: 2048.0 | grad norm: 4.984 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.845 | TFLOPs: 42.32 | +[default7]: iteration 4625/ 6200 | consumed samples: 4736000 | consumed tokens: 9699328000 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661140E+00 | loss scale: 2048.0 | grad norm: 5.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.982 | TFLOPs: 42.36 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 4625 | lm loss value: 3.644848E+00 | lm loss PPL: 3.827694E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 4625 | lm loss value: 1.567772E+00 | lm loss PPL: 4.795953E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 4626/ 6200 | consumed samples: 4737024 | consumed tokens: 9701425152 | elapsed time per iteration (s): 51.81 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657804E+00 | loss scale: 2048.0 | grad norm: 5.665 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.765 | TFLOPs: 6.02 | +[default7]: iteration 4627/ 6200 | consumed samples: 4738048 | consumed tokens: 9703522304 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668813E+00 | loss scale: 2048.0 | grad norm: 5.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.349 | TFLOPs: 42.17 | +[default7]: iteration 4628/ 6200 | consumed samples: 4739072 | consumed tokens: 9705619456 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683748E+00 | loss scale: 2048.0 | grad norm: 6.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.665 | TFLOPs: 42.27 | +[default7]: iteration 4629/ 6200 | consumed samples: 4740096 | consumed tokens: 9707716608 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671854E+00 | loss scale: 2048.0 | grad norm: 5.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.048 | TFLOPs: 42.38 | +[default7]: iteration 4630/ 6200 | consumed samples: 4741120 | consumed tokens: 9709813760 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668189E+00 | loss scale: 2048.0 | grad norm: 5.710 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.975 | TFLOPs: 42.36 | +[default7]: iteration 4631/ 6200 | consumed samples: 4742144 | consumed tokens: 9711910912 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686169E+00 | loss scale: 2048.0 | grad norm: 6.027 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.679 | TFLOPs: 42.27 | +[default7]: iteration 4632/ 6200 | consumed samples: 4743168 | consumed tokens: 9714008064 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676292E+00 | loss scale: 2048.0 | grad norm: 6.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.889 | TFLOPs: 42.33 | +[default7]: iteration 4633/ 6200 | consumed samples: 4744192 | consumed tokens: 9716105216 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641767E+00 | loss scale: 2048.0 | grad norm: 5.588 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.702 | TFLOPs: 42.28 | +[default7]: iteration 4634/ 6200 | consumed samples: 4745216 | consumed tokens: 9718202368 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676483E+00 | loss scale: 2048.0 | grad norm: 6.045 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.027 | TFLOPs: 42.38 | +[default7]: iteration 4635/ 6200 | consumed samples: 4746240 | consumed tokens: 9720299520 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634548E+00 | loss scale: 2048.0 | grad norm: 4.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.813 | TFLOPs: 42.31 | +[default7]: iteration 4636/ 6200 | consumed samples: 4747264 | consumed tokens: 9722396672 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687739E+00 | loss scale: 2048.0 | grad norm: 5.624 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.411 | TFLOPs: 42.19 | +[default7]: iteration 4637/ 6200 | consumed samples: 4748288 | consumed tokens: 9724493824 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688634E+00 | loss scale: 2048.0 | grad norm: 5.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.576 | TFLOPs: 42.24 | +[default7]: iteration 4638/ 6200 | consumed samples: 4749312 | consumed tokens: 9726590976 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661577E+00 | loss scale: 2048.0 | grad norm: 5.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.757 | TFLOPs: 42.29 | +[default7]: iteration 4639/ 6200 | consumed samples: 4750336 | consumed tokens: 9728688128 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663073E+00 | loss scale: 2048.0 | grad norm: 8.745 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.771 | TFLOPs: 42.30 | +[default7]: iteration 4640/ 6200 | consumed samples: 4751360 | consumed tokens: 9730785280 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669799E+00 | loss scale: 2048.0 | grad norm: 6.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.684 | TFLOPs: 42.27 | +[default7]: iteration 4641/ 6200 | consumed samples: 4752384 | consumed tokens: 9732882432 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680171E+00 | loss scale: 2048.0 | grad norm: 5.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.983 | TFLOPs: 42.36 | +[default7]: iteration 4642/ 6200 | consumed samples: 4753408 | consumed tokens: 9734979584 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661692E+00 | loss scale: 2048.0 | grad norm: 5.781 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.108 | TFLOPs: 42.40 | +[default7]: iteration 4643/ 6200 | consumed samples: 4754432 | consumed tokens: 9737076736 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665598E+00 | loss scale: 2048.0 | grad norm: 9.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.120 | TFLOPs: 42.40 | +[default7]: iteration 4644/ 6200 | consumed samples: 4755456 | consumed tokens: 9739173888 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660993E+00 | loss scale: 2048.0 | grad norm: 7.779 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.040 | TFLOPs: 42.38 | +[default7]: iteration 4645/ 6200 | consumed samples: 4756480 | consumed tokens: 9741271040 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654651E+00 | loss scale: 2048.0 | grad norm: 6.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.030 | TFLOPs: 42.38 | +[default7]: iteration 4646/ 6200 | consumed samples: 4757504 | consumed tokens: 9743368192 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669083E+00 | loss scale: 2048.0 | grad norm: 6.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.691 | TFLOPs: 42.27 | +[default7]: iteration 4647/ 6200 | consumed samples: 4758528 | consumed tokens: 9745465344 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689347E+00 | loss scale: 2048.0 | grad norm: 7.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.958 | TFLOPs: 42.36 | +[default7]: iteration 4648/ 6200 | consumed samples: 4759552 | consumed tokens: 9747562496 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665579E+00 | loss scale: 2048.0 | grad norm: 4.956 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.028 | TFLOPs: 42.38 | +[default7]: iteration 4649/ 6200 | consumed samples: 4760576 | consumed tokens: 9749659648 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690180E+00 | loss scale: 2048.0 | grad norm: 4.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.986 | TFLOPs: 42.36 | +[default7]: iteration 4650/ 6200 | consumed samples: 4761600 | consumed tokens: 9751756800 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652709E+00 | loss scale: 2048.0 | grad norm: 5.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.845 | TFLOPs: 42.32 | +[default7]: iteration 4651/ 6200 | consumed samples: 4762624 | consumed tokens: 9753853952 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668131E+00 | loss scale: 2048.0 | grad norm: 5.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.816 | TFLOPs: 42.31 | +[default7]: iteration 4652/ 6200 | consumed samples: 4763648 | consumed tokens: 9755951104 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688019E+00 | loss scale: 2048.0 | grad norm: 5.780 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.661 | TFLOPs: 42.27 | +[default7]: iteration 4653/ 6200 | consumed samples: 4764672 | consumed tokens: 9758048256 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689040E+00 | loss scale: 2048.0 | grad norm: 4.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.062 | TFLOPs: 42.39 | +[default7]: iteration 4654/ 6200 | consumed samples: 4765696 | consumed tokens: 9760145408 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624276E+00 | loss scale: 2048.0 | grad norm: 5.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.907 | TFLOPs: 42.34 | +[default7]: iteration 4655/ 6200 | consumed samples: 4766720 | consumed tokens: 9762242560 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678114E+00 | loss scale: 2048.0 | grad norm: 5.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.949 | TFLOPs: 42.35 | +[default7]: iteration 4656/ 6200 | consumed samples: 4767744 | consumed tokens: 9764339712 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666919E+00 | loss scale: 2048.0 | grad norm: 5.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.019 | TFLOPs: 42.37 | +[default7]: iteration 4657/ 6200 | consumed samples: 4768768 | consumed tokens: 9766436864 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676364E+00 | loss scale: 2048.0 | grad norm: 5.017 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.393 | TFLOPs: 42.18 | +[default7]: iteration 4658/ 6200 | consumed samples: 4769792 | consumed tokens: 9768534016 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676962E+00 | loss scale: 2048.0 | grad norm: 5.799 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.667 | TFLOPs: 42.27 | +[default7]: iteration 4659/ 6200 | consumed samples: 4770816 | consumed tokens: 9770631168 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687407E+00 | loss scale: 2048.0 | grad norm: 5.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.625 | TFLOPs: 42.25 | +[default7]: iteration 4660/ 6200 | consumed samples: 4771840 | consumed tokens: 9772728320 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653848E+00 | loss scale: 2048.0 | grad norm: 5.646 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.604 | TFLOPs: 42.25 | +[default7]: iteration 4661/ 6200 | consumed samples: 4772864 | consumed tokens: 9774825472 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665437E+00 | loss scale: 2048.0 | grad norm: 5.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.838 | TFLOPs: 42.32 | +[default7]: iteration 4662/ 6200 | consumed samples: 4773888 | consumed tokens: 9776922624 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688157E+00 | loss scale: 2048.0 | grad norm: 5.995 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.851 | TFLOPs: 42.32 | +[default7]: iteration 4663/ 6200 | consumed samples: 4774912 | consumed tokens: 9779019776 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662607E+00 | loss scale: 2048.0 | grad norm: 5.676 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.108 | TFLOPs: 42.40 | +[default7]: iteration 4664/ 6200 | consumed samples: 4775936 | consumed tokens: 9781116928 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673660E+00 | loss scale: 2048.0 | grad norm: 6.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.999 | TFLOPs: 42.37 | +[default7]: iteration 4665/ 6200 | consumed samples: 4776960 | consumed tokens: 9783214080 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658453E+00 | loss scale: 2048.0 | grad norm: 4.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.089 | TFLOPs: 42.40 | +[default7]: iteration 4666/ 6200 | consumed samples: 4777984 | consumed tokens: 9785311232 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.664139E+00 | loss scale: 2048.0 | grad norm: 5.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.902 | TFLOPs: 42.34 | +[default7]: iteration 4667/ 6200 | consumed samples: 4779008 | consumed tokens: 9787408384 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627836E+00 | loss scale: 2048.0 | grad norm: 5.715 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.088 | TFLOPs: 42.40 | +[default7]: iteration 4668/ 6200 | consumed samples: 4780032 | consumed tokens: 9789505536 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657395E+00 | loss scale: 2048.0 | grad norm: 5.922 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.090 | TFLOPs: 42.40 | +[default7]: iteration 4669/ 6200 | consumed samples: 4781056 | consumed tokens: 9791602688 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654012E+00 | loss scale: 2048.0 | grad norm: 4.785 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.033 | TFLOPs: 42.38 | +[default7]: iteration 4670/ 6200 | consumed samples: 4782080 | consumed tokens: 9793699840 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688552E+00 | loss scale: 2048.0 | grad norm: 5.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.139 | TFLOPs: 42.41 | +[default7]: iteration 4671/ 6200 | consumed samples: 4783104 | consumed tokens: 9795796992 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672748E+00 | loss scale: 2048.0 | grad norm: 5.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.095 | TFLOPs: 42.40 | +[default7]: iteration 4672/ 6200 | consumed samples: 4784128 | consumed tokens: 9797894144 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657125E+00 | loss scale: 2048.0 | grad norm: 5.041 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.186 | TFLOPs: 42.42 | +[default7]: iteration 4673/ 6200 | consumed samples: 4785152 | consumed tokens: 9799991296 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666935E+00 | loss scale: 2048.0 | grad norm: 4.918 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.078 | TFLOPs: 42.39 | +[default7]: iteration 4674/ 6200 | consumed samples: 4786176 | consumed tokens: 9802088448 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672305E+00 | loss scale: 2048.0 | grad norm: 4.919 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.910 | TFLOPs: 42.34 | +[default7]: iteration 4675/ 6200 | consumed samples: 4787200 | consumed tokens: 9804185600 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682348E+00 | loss scale: 2048.0 | grad norm: 5.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.847 | TFLOPs: 42.32 | +[default7]: iteration 4676/ 6200 | consumed samples: 4788224 | consumed tokens: 9806282752 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636951E+00 | loss scale: 2048.0 | grad norm: 4.946 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.873 | TFLOPs: 42.33 | +[default7]: iteration 4677/ 6200 | consumed samples: 4789248 | consumed tokens: 9808379904 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686509E+00 | loss scale: 2048.0 | grad norm: 5.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.863 | TFLOPs: 42.33 | +[default7]: iteration 4678/ 6200 | consumed samples: 4790272 | consumed tokens: 9810477056 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667886E+00 | loss scale: 2048.0 | grad norm: 5.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.435 | TFLOPs: 42.20 | +[default7]: iteration 4679/ 6200 | consumed samples: 4791296 | consumed tokens: 9812574208 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682222E+00 | loss scale: 2048.0 | grad norm: 6.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.655 | TFLOPs: 42.26 | +[default7]: iteration 4680/ 6200 | consumed samples: 4792320 | consumed tokens: 9814671360 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669666E+00 | loss scale: 2048.0 | grad norm: 5.612 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.541 | TFLOPs: 42.23 | +[default7]: iteration 4681/ 6200 | consumed samples: 4793344 | consumed tokens: 9816768512 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656300E+00 | loss scale: 2048.0 | grad norm: 5.043 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.623 | TFLOPs: 42.25 | +[default7]: iteration 4682/ 6200 | consumed samples: 4794368 | consumed tokens: 9818865664 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645804E+00 | loss scale: 2048.0 | grad norm: 5.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.006 | TFLOPs: 42.37 | +[default7]: iteration 4683/ 6200 | consumed samples: 4795392 | consumed tokens: 9820962816 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676320E+00 | loss scale: 2048.0 | grad norm: 5.761 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.030 | TFLOPs: 42.38 | +[default7]: iteration 4684/ 6200 | consumed samples: 4796416 | consumed tokens: 9823059968 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693340E+00 | loss scale: 2048.0 | grad norm: 5.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.729 | TFLOPs: 42.29 | +[default7]: iteration 4685/ 6200 | consumed samples: 4797440 | consumed tokens: 9825157120 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686742E+00 | loss scale: 2048.0 | grad norm: 5.618 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.721 | TFLOPs: 42.28 | +[default7]: iteration 4686/ 6200 | consumed samples: 4798464 | consumed tokens: 9827254272 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.709756E+00 | loss scale: 2048.0 | grad norm: 5.889 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.902 | TFLOPs: 42.34 | +[default7]: iteration 4687/ 6200 | consumed samples: 4799488 | consumed tokens: 9829351424 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648015E+00 | loss scale: 2048.0 | grad norm: 4.923 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.612 | TFLOPs: 42.25 | +[default7]: iteration 4688/ 6200 | consumed samples: 4800512 | consumed tokens: 9831448576 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666191E+00 | loss scale: 2048.0 | grad norm: 5.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.459 | TFLOPs: 42.20 | +[default7]: iteration 4689/ 6200 | consumed samples: 4801536 | consumed tokens: 9833545728 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674894E+00 | loss scale: 2048.0 | grad norm: 5.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.074 | TFLOPs: 42.39 | +[default7]: iteration 4690/ 6200 | consumed samples: 4802560 | consumed tokens: 9835642880 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660787E+00 | loss scale: 2048.0 | grad norm: 5.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.794 | TFLOPs: 42.31 | +[default7]: iteration 4691/ 6200 | consumed samples: 4803584 | consumed tokens: 9837740032 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680150E+00 | loss scale: 2048.0 | grad norm: 5.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.956 | TFLOPs: 42.36 | +[default7]: iteration 4692/ 6200 | consumed samples: 4804608 | consumed tokens: 9839837184 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668795E+00 | loss scale: 2048.0 | grad norm: 5.025 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.915 | TFLOPs: 42.34 | +[default7]: iteration 4693/ 6200 | consumed samples: 4805632 | consumed tokens: 9841934336 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665017E+00 | loss scale: 2048.0 | grad norm: 5.067 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.967 | TFLOPs: 42.36 | +[default7]: iteration 4694/ 6200 | consumed samples: 4806656 | consumed tokens: 9844031488 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681494E+00 | loss scale: 2048.0 | grad norm: 5.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.546 | TFLOPs: 42.23 | +[default7]: iteration 4695/ 6200 | consumed samples: 4807680 | consumed tokens: 9846128640 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679830E+00 | loss scale: 2048.0 | grad norm: 4.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.616 | TFLOPs: 42.25 | +[default7]: iteration 4696/ 6200 | consumed samples: 4808704 | consumed tokens: 9848225792 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670005E+00 | loss scale: 2048.0 | grad norm: 5.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.579 | TFLOPs: 42.24 | +[default7]: iteration 4697/ 6200 | consumed samples: 4809728 | consumed tokens: 9850322944 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657112E+00 | loss scale: 2048.0 | grad norm: 6.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.832 | TFLOPs: 42.32 | +[default7]: iteration 4698/ 6200 | consumed samples: 4810752 | consumed tokens: 9852420096 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649801E+00 | loss scale: 2048.0 | grad norm: 5.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.944 | TFLOPs: 42.35 | +[default7]: iteration 4699/ 6200 | consumed samples: 4811776 | consumed tokens: 9854517248 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667605E+00 | loss scale: 2048.0 | grad norm: 4.979 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.813 | TFLOPs: 42.31 | +[default7]: iteration 4700/ 6200 | consumed samples: 4812800 | consumed tokens: 9856614400 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681124E+00 | loss scale: 2048.0 | grad norm: 5.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.678 | TFLOPs: 42.27 | +[default7]: iteration 4701/ 6200 | consumed samples: 4813824 | consumed tokens: 9858711552 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656203E+00 | loss scale: 2048.0 | grad norm: 5.897 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.843 | TFLOPs: 42.32 | +[default7]: iteration 4702/ 6200 | consumed samples: 4814848 | consumed tokens: 9860808704 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668983E+00 | loss scale: 2048.0 | grad norm: 5.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.922 | TFLOPs: 42.34 | +[default7]: iteration 4703/ 6200 | consumed samples: 4815872 | consumed tokens: 9862905856 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665200E+00 | loss scale: 2048.0 | grad norm: 5.765 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.925 | TFLOPs: 42.35 | +[default7]: iteration 4704/ 6200 | consumed samples: 4816896 | consumed tokens: 9865003008 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646860E+00 | loss scale: 2048.0 | grad norm: 4.742 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.096 | TFLOPs: 42.40 | +[default7]: iteration 4705/ 6200 | consumed samples: 4817920 | consumed tokens: 9867100160 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670280E+00 | loss scale: 2048.0 | grad norm: 5.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.077 | TFLOPs: 42.39 | +[default7]: iteration 4706/ 6200 | consumed samples: 4818944 | consumed tokens: 9869197312 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693526E+00 | loss scale: 4096.0 | grad norm: 2.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.892 | TFLOPs: 42.34 | +[default7]: iteration 4707/ 6200 | consumed samples: 4819968 | consumed tokens: 9871294464 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683122E+00 | loss scale: 4096.0 | grad norm: 5.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.037 | TFLOPs: 42.38 | +[default7]: iteration 4708/ 6200 | consumed samples: 4820992 | consumed tokens: 9873391616 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660289E+00 | loss scale: 4096.0 | grad norm: 5.946 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.062 | TFLOPs: 42.39 | +[default7]: iteration 4709/ 6200 | consumed samples: 4822016 | consumed tokens: 9875488768 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640487E+00 | loss scale: 4096.0 | grad norm: 4.977 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.970 | TFLOPs: 42.36 | +[default7]: iteration 4710/ 6200 | consumed samples: 4823040 | consumed tokens: 9877585920 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678736E+00 | loss scale: 4096.0 | grad norm: 4.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.908 | TFLOPs: 42.34 | +[default7]: iteration 4711/ 6200 | consumed samples: 4824064 | consumed tokens: 9879683072 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667243E+00 | loss scale: 4096.0 | grad norm: 6.034 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.036 | TFLOPs: 42.38 | +[default7]: iteration 4712/ 6200 | consumed samples: 4825088 | consumed tokens: 9881780224 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672941E+00 | loss scale: 4096.0 | grad norm: 6.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.949 | TFLOPs: 42.35 | +[default7]: iteration 4713/ 6200 | consumed samples: 4826112 | consumed tokens: 9883877376 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639998E+00 | loss scale: 4096.0 | grad norm: 4.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.014 | TFLOPs: 42.37 | +[default7]: iteration 4714/ 6200 | consumed samples: 4827136 | consumed tokens: 9885974528 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651199E+00 | loss scale: 4096.0 | grad norm: 4.924 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.938 | TFLOPs: 42.35 | +[default7]: iteration 4715/ 6200 | consumed samples: 4828160 | consumed tokens: 9888071680 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643741E+00 | loss scale: 4096.0 | grad norm: 6.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.004 | TFLOPs: 42.37 | +[default7]: iteration 4716/ 6200 | consumed samples: 4829184 | consumed tokens: 9890168832 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.684801E+00 | loss scale: 4096.0 | grad norm: 5.635 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.974 | TFLOPs: 42.36 | +[default7]: iteration 4717/ 6200 | consumed samples: 4830208 | consumed tokens: 9892265984 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.664608E+00 | loss scale: 4096.0 | grad norm: 4.940 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.810 | TFLOPs: 42.31 | +[default7]: iteration 4718/ 6200 | consumed samples: 4831232 | consumed tokens: 9894363136 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635729E+00 | loss scale: 4096.0 | grad norm: 5.846 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.924 | TFLOPs: 42.35 | +[default7]: iteration 4719/ 6200 | consumed samples: 4832256 | consumed tokens: 9896460288 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644753E+00 | loss scale: 4096.0 | grad norm: 4.829 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.898 | TFLOPs: 42.34 | +[default7]: iteration 4720/ 6200 | consumed samples: 4833280 | consumed tokens: 9898557440 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662336E+00 | loss scale: 4096.0 | grad norm: 5.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.920 | TFLOPs: 42.34 | +[default7]: iteration 4721/ 6200 | consumed samples: 4834304 | consumed tokens: 9900654592 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668311E+00 | loss scale: 4096.0 | grad norm: 6.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.001 | TFLOPs: 42.37 | +[default7]: iteration 4722/ 6200 | consumed samples: 4835328 | consumed tokens: 9902751744 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674906E+00 | loss scale: 4096.0 | grad norm: 4.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.059 | TFLOPs: 42.39 | +[default7]: iteration 4723/ 6200 | consumed samples: 4836352 | consumed tokens: 9904848896 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648349E+00 | loss scale: 4096.0 | grad norm: 6.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.101 | TFLOPs: 42.40 | +[default7]: iteration 4724/ 6200 | consumed samples: 4837376 | consumed tokens: 9906946048 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.705124E+00 | loss scale: 4096.0 | grad norm: 6.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.042 | TFLOPs: 42.38 | +[default7]: iteration 4725/ 6200 | consumed samples: 4838400 | consumed tokens: 9909043200 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673065E+00 | loss scale: 4096.0 | grad norm: 6.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.946 | TFLOPs: 42.35 | +[default7]: iteration 4726/ 6200 | consumed samples: 4839424 | consumed tokens: 9911140352 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683279E+00 | loss scale: 4096.0 | grad norm: 5.771 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.154 | TFLOPs: 42.42 | +[default7]: iteration 4727/ 6200 | consumed samples: 4840448 | consumed tokens: 9913237504 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658836E+00 | loss scale: 4096.0 | grad norm: 7.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.199 | TFLOPs: 42.43 | +[default7]: iteration 4728/ 6200 | consumed samples: 4841472 | consumed tokens: 9915334656 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.701910E+00 | loss scale: 4096.0 | grad norm: 5.924 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.958 | TFLOPs: 42.36 | +[default7]: iteration 4729/ 6200 | consumed samples: 4842496 | consumed tokens: 9917431808 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679605E+00 | loss scale: 4096.0 | grad norm: 4.887 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.950 | TFLOPs: 42.35 | +[default7]: iteration 4730/ 6200 | consumed samples: 4843520 | consumed tokens: 9919528960 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666471E+00 | loss scale: 4096.0 | grad norm: 5.771 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.087 | TFLOPs: 42.39 | +[default7]: iteration 4731/ 6200 | consumed samples: 4844544 | consumed tokens: 9921626112 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656663E+00 | loss scale: 4096.0 | grad norm: 5.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.994 | TFLOPs: 42.37 | +[default7]: iteration 4732/ 6200 | consumed samples: 4845568 | consumed tokens: 9923723264 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667974E+00 | loss scale: 4096.0 | grad norm: 6.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.950 | TFLOPs: 42.35 | +[default7]: iteration 4733/ 6200 | consumed samples: 4846592 | consumed tokens: 9925820416 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.655341E+00 | loss scale: 4096.0 | grad norm: 5.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.065 | TFLOPs: 42.39 | +[default7]: iteration 4734/ 6200 | consumed samples: 4847616 | consumed tokens: 9927917568 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686029E+00 | loss scale: 4096.0 | grad norm: 5.074 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.281 | TFLOPs: 42.45 | +[default7]: iteration 4735/ 6200 | consumed samples: 4848640 | consumed tokens: 9930014720 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637470E+00 | loss scale: 4096.0 | grad norm: 5.036 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.132 | TFLOPs: 42.41 | +[default7]: iteration 4736/ 6200 | consumed samples: 4849664 | consumed tokens: 9932111872 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674360E+00 | loss scale: 4096.0 | grad norm: 5.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.193 | TFLOPs: 42.43 | +[default7]: iteration 4737/ 6200 | consumed samples: 4850688 | consumed tokens: 9934209024 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661576E+00 | loss scale: 4096.0 | grad norm: 5.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.967 | TFLOPs: 42.36 | +[default7]: iteration 4738/ 6200 | consumed samples: 4851712 | consumed tokens: 9936306176 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671280E+00 | loss scale: 4096.0 | grad norm: 5.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.150 | TFLOPs: 42.41 | +[default7]: iteration 4739/ 6200 | consumed samples: 4852736 | consumed tokens: 9938403328 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652741E+00 | loss scale: 4096.0 | grad norm: 5.837 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.252 | TFLOPs: 42.45 | +[default7]: iteration 4740/ 6200 | consumed samples: 4853760 | consumed tokens: 9940500480 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647455E+00 | loss scale: 4096.0 | grad norm: 5.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.256 | TFLOPs: 42.45 | +[default7]: iteration 4741/ 6200 | consumed samples: 4854784 | consumed tokens: 9942597632 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653637E+00 | loss scale: 4096.0 | grad norm: 6.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.117 | TFLOPs: 42.40 | +[default7]: iteration 4742/ 6200 | consumed samples: 4855808 | consumed tokens: 9944694784 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654976E+00 | loss scale: 4096.0 | grad norm: 6.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.058 | TFLOPs: 42.39 | +[default7]: iteration 4743/ 6200 | consumed samples: 4856832 | consumed tokens: 9946791936 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669371E+00 | loss scale: 4096.0 | grad norm: 6.042 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.082 | TFLOPs: 42.39 | +[default7]: iteration 4744/ 6200 | consumed samples: 4857856 | consumed tokens: 9948889088 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656481E+00 | loss scale: 4096.0 | grad norm: 4.863 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.244 | TFLOPs: 42.44 | +[default7]: iteration 4745/ 6200 | consumed samples: 4858880 | consumed tokens: 9950986240 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698044E+00 | loss scale: 4096.0 | grad norm: 5.571 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.093 | TFLOPs: 42.40 | +[default7]: iteration 4746/ 6200 | consumed samples: 4859904 | consumed tokens: 9953083392 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652979E+00 | loss scale: 4096.0 | grad norm: 5.758 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.025 | TFLOPs: 42.38 | +[default7]: iteration 4747/ 6200 | consumed samples: 4860928 | consumed tokens: 9955180544 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663784E+00 | loss scale: 4096.0 | grad norm: 5.861 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.840 | TFLOPs: 42.32 | +[default7]: iteration 4748/ 6200 | consumed samples: 4861952 | consumed tokens: 9957277696 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656476E+00 | loss scale: 4096.0 | grad norm: 4.862 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.231 | TFLOPs: 42.44 | +[default7]: iteration 4749/ 6200 | consumed samples: 4862976 | consumed tokens: 9959374848 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.696282E+00 | loss scale: 4096.0 | grad norm: 5.903 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.300 | TFLOPs: 42.46 | +[default7]: iteration 4750/ 6200 | consumed samples: 4864000 | consumed tokens: 9961472000 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.677007E+00 | loss scale: 4096.0 | grad norm: 7.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.904 | TFLOPs: 42.34 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 4750 | lm loss value: 3.626354E+00 | lm loss PPL: 3.757557E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 4750 | lm loss value: 1.558914E+00 | lm loss PPL: 4.753656E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 4750 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 22:56:40,907] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4750 is begin to save! +[default0]:[2022-10-06 22:56:40,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,313] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,342] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,369] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,422] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,474] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,528] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,554] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,607] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,740] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,767] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,794] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,821] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,901] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_25-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_26-model_00-model_states.pt... +[default1]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 22:56:41,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,954] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 22:56:41,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 22:56:41,956] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/mp_rank_00_model_states.pt +[default4]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 22:56:41,956] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 22:56:41,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 22:56:41,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 22:56:42,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 22:56:42,154] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 22:56:42,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 22:56:42,177] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 22:56:42,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 22:56:42,175] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 22:56:42,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 22:56:42,177] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 22:56:42,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 22:56:42,187] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 22:56:42,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 22:56:42,250] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 22:56:42,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 22:56:42,187] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 22:56:42,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 22:56:42,256] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 22:56:42,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 22:56:42,257] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 22:56:42,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 22:56:42,203] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 22:56:42,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 22:56:42,210] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 22:56:42,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 22:56:42,284] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 22:56:42,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 22:56:42,259] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 22:56:42,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 22:56:42,271] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 22:56:42,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 22:56:42,268] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 22:56:42,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 22:56:42,269] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 22:56:42,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 22:56:42,281] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 22:56:42,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 22:56:42,275] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 22:56:42,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 22:56:42,276] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 22:56:42,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 22:56:42,285] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 22:56:42,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 22:56:42,277] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 22:56:42,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 22:56:42,246] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 22:56:42,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 22:56:42,289] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 22:56:42,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 22:56:42,270] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 22:56:42,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 22:56:42,340] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 22:56:42,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 22:56:42,271] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 22:56:42,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 22:56:42,284] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 22:56:42,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 22:56:42,283] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 22:56:42,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 22:56:42,316] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 22:56:42,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 22:56:42,389] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default3]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default4]:[2022-10-06 22:56:42,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 22:56:42,436] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default0]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default0]: successfully saved checkpoint at iteration 4750 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default5]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default2]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default2]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default1]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default7]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default0]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default3]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default7]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default4]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default1]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default2]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default5]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default1]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default0]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default4]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default6]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default6]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default0]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default6]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default7]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default7]:time (ms) | save-checkpoint: 1530.05 +[default5]:[2022-10-06 22:56:42,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 22:56:42,431] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step4750/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default3]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default1]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default3]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default6]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default5]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default4]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default2]:[2022-10-06 22:56:42,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4750 is ready now! +[default7]: iteration 4751/ 6200 | consumed samples: 4865024 | consumed tokens: 9963569152 | elapsed time per iteration (s): 53.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665390E+00 | loss scale: 4096.0 | grad norm: 5.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.183 | TFLOPs: 5.85 | +[default7]: iteration 4752/ 6200 | consumed samples: 4866048 | consumed tokens: 9965666304 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660560E+00 | loss scale: 4096.0 | grad norm: 7.058 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.906 | TFLOPs: 42.34 | +[default7]: iteration 4753/ 6200 | consumed samples: 4867072 | consumed tokens: 9967763456 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681983E+00 | loss scale: 4096.0 | grad norm: 6.776 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.854 | TFLOPs: 42.32 | +[default7]: iteration 4754/ 6200 | consumed samples: 4868096 | consumed tokens: 9969860608 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642407E+00 | loss scale: 4096.0 | grad norm: 5.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.870 | TFLOPs: 42.33 | +[default7]: iteration 4755/ 6200 | consumed samples: 4869120 | consumed tokens: 9971957760 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676298E+00 | loss scale: 4096.0 | grad norm: 5.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.957 | TFLOPs: 42.36 | +[default7]: iteration 4756/ 6200 | consumed samples: 4870144 | consumed tokens: 9974054912 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656712E+00 | loss scale: 4096.0 | grad norm: 5.910 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.976 | TFLOPs: 42.36 | +[default7]: iteration 4757/ 6200 | consumed samples: 4871168 | consumed tokens: 9976152064 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647768E+00 | loss scale: 4096.0 | grad norm: 5.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.732 | TFLOPs: 42.29 | +[default7]: iteration 4758/ 6200 | consumed samples: 4872192 | consumed tokens: 9978249216 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646880E+00 | loss scale: 4096.0 | grad norm: 6.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.878 | TFLOPs: 42.33 | +[default7]: iteration 4759/ 6200 | consumed samples: 4873216 | consumed tokens: 9980346368 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667758E+00 | loss scale: 4096.0 | grad norm: 6.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.706 | TFLOPs: 42.28 | +[default7]: iteration 4760/ 6200 | consumed samples: 4874240 | consumed tokens: 9982443520 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661086E+00 | loss scale: 4096.0 | grad norm: 5.917 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.393 | TFLOPs: 42.49 | +[default7]: iteration 4761/ 6200 | consumed samples: 4875264 | consumed tokens: 9984540672 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681113E+00 | loss scale: 4096.0 | grad norm: 5.966 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.490 | TFLOPs: 42.52 | +[default7]: iteration 4762/ 6200 | consumed samples: 4876288 | consumed tokens: 9986637824 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679607E+00 | loss scale: 4096.0 | grad norm: 6.772 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.232 | TFLOPs: 42.44 | +[default7]: iteration 4763/ 6200 | consumed samples: 4877312 | consumed tokens: 9988734976 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637599E+00 | loss scale: 4096.0 | grad norm: 6.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.381 | TFLOPs: 42.48 | +[default7]: iteration 4764/ 6200 | consumed samples: 4878336 | consumed tokens: 9990832128 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673086E+00 | loss scale: 4096.0 | grad norm: 5.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.336 | TFLOPs: 42.47 | +[default7]: iteration 4765/ 6200 | consumed samples: 4879360 | consumed tokens: 9992929280 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681473E+00 | loss scale: 4096.0 | grad norm: 5.828 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.354 | TFLOPs: 42.48 | +[default7]: iteration 4766/ 6200 | consumed samples: 4880384 | consumed tokens: 9995026432 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645267E+00 | loss scale: 4096.0 | grad norm: 6.830 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.343 | TFLOPs: 42.47 | +[default7]: iteration 4767/ 6200 | consumed samples: 4881408 | consumed tokens: 9997123584 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689481E+00 | loss scale: 4096.0 | grad norm: 5.963 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.177 | TFLOPs: 42.42 | +[default7]: iteration 4768/ 6200 | consumed samples: 4882432 | consumed tokens: 9999220736 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669846E+00 | loss scale: 4096.0 | grad norm: 5.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.467 | TFLOPs: 42.51 | +[default7]: iteration 4769/ 6200 | consumed samples: 4883456 | consumed tokens: 10001317888 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665992E+00 | loss scale: 4096.0 | grad norm: 6.971 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.291 | TFLOPs: 42.46 | +[default7]: iteration 4770/ 6200 | consumed samples: 4884480 | consumed tokens: 10003415040 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.711338E+00 | loss scale: 4096.0 | grad norm: 7.644 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.234 | TFLOPs: 42.44 | +[default7]: iteration 4771/ 6200 | consumed samples: 4885504 | consumed tokens: 10005512192 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651557E+00 | loss scale: 4096.0 | grad norm: 4.839 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.417 | TFLOPs: 42.50 | +[default7]: iteration 4772/ 6200 | consumed samples: 4886528 | consumed tokens: 10007609344 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637464E+00 | loss scale: 4096.0 | grad norm: 5.655 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.490 | TFLOPs: 42.52 | +[default7]: iteration 4773/ 6200 | consumed samples: 4887552 | consumed tokens: 10009706496 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.655596E+00 | loss scale: 4096.0 | grad norm: 6.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.962 | TFLOPs: 42.36 | +[default7]: iteration 4774/ 6200 | consumed samples: 4888576 | consumed tokens: 10011803648 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.655804E+00 | loss scale: 4096.0 | grad norm: 5.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.882 | TFLOPs: 42.33 | +[default7]: iteration 4775/ 6200 | consumed samples: 4889600 | consumed tokens: 10013900800 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648802E+00 | loss scale: 4096.0 | grad norm: 5.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.962 | TFLOPs: 42.36 | +[default7]: iteration 4776/ 6200 | consumed samples: 4890624 | consumed tokens: 10015997952 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681254E+00 | loss scale: 4096.0 | grad norm: 4.899 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.604 | TFLOPs: 42.25 | +[default7]: iteration 4777/ 6200 | consumed samples: 4891648 | consumed tokens: 10018095104 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638351E+00 | loss scale: 4096.0 | grad norm: 4.740 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.003 | TFLOPs: 42.37 | +[default7]: iteration 4778/ 6200 | consumed samples: 4892672 | consumed tokens: 10020192256 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676085E+00 | loss scale: 4096.0 | grad norm: 5.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.062 | TFLOPs: 42.39 | +[default7]: iteration 4779/ 6200 | consumed samples: 4893696 | consumed tokens: 10022289408 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679073E+00 | loss scale: 4096.0 | grad norm: 5.830 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.890 | TFLOPs: 42.33 | +[default7]: iteration 4780/ 6200 | consumed samples: 4894720 | consumed tokens: 10024386560 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624320E+00 | loss scale: 4096.0 | grad norm: 4.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.820 | TFLOPs: 42.31 | +[default0]:[2022-10-06 23:00:30,579] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096.0, reducing to 4096.0 +[default7]: iteration 4781/ 6200 | consumed samples: 4895744 | consumed tokens: 10026483712 | elapsed time per iteration (s): 7.27 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663772E+00 | loss scale: 4096.0 | grad norm: 4.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 140.801 | TFLOPs: 42.92 | +[default7]: iteration 4782/ 6200 | consumed samples: 4896768 | consumed tokens: 10028580864 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654585E+00 | loss scale: 4096.0 | grad norm: 4.832 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.833 | TFLOPs: 42.32 | +[default7]: iteration 4783/ 6200 | consumed samples: 4897792 | consumed tokens: 10030678016 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645745E+00 | loss scale: 4096.0 | grad norm: 4.909 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.743 | TFLOPs: 42.29 | +[default7]: iteration 4784/ 6200 | consumed samples: 4898816 | consumed tokens: 10032775168 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676148E+00 | loss scale: 4096.0 | grad norm: 5.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.698 | TFLOPs: 42.28 | +[default7]: iteration 4785/ 6200 | consumed samples: 4899840 | consumed tokens: 10034872320 | elapsed time per iteration (s): 7.28 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667580E+00 | loss scale: 2048.0 | grad norm: 5.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 140.717 | TFLOPs: 42.89 | +[default0]:[2022-10-06 23:01:00,000] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[default7]: iteration 4786/ 6200 | consumed samples: 4900864 | consumed tokens: 10036969472 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670167E+00 | loss scale: 2048.0 | grad norm: 5.790 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.556 | TFLOPs: 42.23 | +[default7]: iteration 4787/ 6200 | consumed samples: 4901888 | consumed tokens: 10039066624 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649671E+00 | loss scale: 2048.0 | grad norm: 4.947 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.579 | TFLOPs: 42.24 | +[default7]: iteration 4788/ 6200 | consumed samples: 4902912 | consumed tokens: 10041163776 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647768E+00 | loss scale: 2048.0 | grad norm: 7.845 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.807 | TFLOPs: 42.31 | +[default7]: iteration 4789/ 6200 | consumed samples: 4903936 | consumed tokens: 10043260928 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661651E+00 | loss scale: 2048.0 | grad norm: 5.807 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.477 | TFLOPs: 42.21 | +[default7]: iteration 4790/ 6200 | consumed samples: 4904960 | consumed tokens: 10045358080 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.664726E+00 | loss scale: 2048.0 | grad norm: 5.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.614 | TFLOPs: 42.25 | +[default7]: iteration 4791/ 6200 | consumed samples: 4905984 | consumed tokens: 10047455232 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661543E+00 | loss scale: 2048.0 | grad norm: 5.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.353 | TFLOPs: 42.17 | +[default7]: iteration 4792/ 6200 | consumed samples: 4907008 | consumed tokens: 10049552384 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646366E+00 | loss scale: 2048.0 | grad norm: 6.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.542 | TFLOPs: 42.23 | +[default7]: iteration 4793/ 6200 | consumed samples: 4908032 | consumed tokens: 10051649536 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674745E+00 | loss scale: 2048.0 | grad norm: 6.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.188 | TFLOPs: 42.12 | +[default7]: iteration 4794/ 6200 | consumed samples: 4909056 | consumed tokens: 10053746688 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675767E+00 | loss scale: 2048.0 | grad norm: 6.004 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.697 | TFLOPs: 42.28 | +[default7]: iteration 4795/ 6200 | consumed samples: 4910080 | consumed tokens: 10055843840 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680840E+00 | loss scale: 2048.0 | grad norm: 5.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.692 | TFLOPs: 42.27 | +[default7]: iteration 4796/ 6200 | consumed samples: 4911104 | consumed tokens: 10057940992 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659523E+00 | loss scale: 2048.0 | grad norm: 5.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.582 | TFLOPs: 42.24 | +[default7]: iteration 4797/ 6200 | consumed samples: 4912128 | consumed tokens: 10060038144 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666910E+00 | loss scale: 2048.0 | grad norm: 5.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.739 | TFLOPs: 42.29 | +[default7]: iteration 4798/ 6200 | consumed samples: 4913152 | consumed tokens: 10062135296 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667869E+00 | loss scale: 2048.0 | grad norm: 5.725 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.934 | TFLOPs: 42.35 | +[default7]: iteration 4799/ 6200 | consumed samples: 4914176 | consumed tokens: 10064232448 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668890E+00 | loss scale: 2048.0 | grad norm: 4.874 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.762 | TFLOPs: 42.30 | +[default7]: iteration 4800/ 6200 | consumed samples: 4915200 | consumed tokens: 10066329600 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680007E+00 | loss scale: 2048.0 | grad norm: 5.632 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.986 | TFLOPs: 42.36 | +[default7]: iteration 4801/ 6200 | consumed samples: 4916224 | consumed tokens: 10068426752 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699565E+00 | loss scale: 2048.0 | grad norm: 6.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.854 | TFLOPs: 42.32 | +[default7]: iteration 4802/ 6200 | consumed samples: 4917248 | consumed tokens: 10070523904 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639754E+00 | loss scale: 2048.0 | grad norm: 4.968 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.053 | TFLOPs: 42.38 | +[default7]: iteration 4803/ 6200 | consumed samples: 4918272 | consumed tokens: 10072621056 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679000E+00 | loss scale: 2048.0 | grad norm: 5.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.510 | TFLOPs: 42.22 | +[default7]: iteration 4804/ 6200 | consumed samples: 4919296 | consumed tokens: 10074718208 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.689614E+00 | loss scale: 2048.0 | grad norm: 5.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.712 | TFLOPs: 42.28 | +[default7]: iteration 4805/ 6200 | consumed samples: 4920320 | consumed tokens: 10076815360 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669515E+00 | loss scale: 2048.0 | grad norm: 4.935 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.466 | TFLOPs: 42.21 | +[default7]: iteration 4806/ 6200 | consumed samples: 4921344 | consumed tokens: 10078912512 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658772E+00 | loss scale: 2048.0 | grad norm: 6.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.518 | TFLOPs: 42.22 | +[default7]: iteration 4807/ 6200 | consumed samples: 4922368 | consumed tokens: 10081009664 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676565E+00 | loss scale: 2048.0 | grad norm: 5.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.603 | TFLOPs: 42.25 | +[default7]: iteration 4808/ 6200 | consumed samples: 4923392 | consumed tokens: 10083106816 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656415E+00 | loss scale: 2048.0 | grad norm: 4.854 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.864 | TFLOPs: 42.33 | +[default7]: iteration 4809/ 6200 | consumed samples: 4924416 | consumed tokens: 10085203968 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674673E+00 | loss scale: 2048.0 | grad norm: 6.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.810 | TFLOPs: 42.31 | +[default7]: iteration 4810/ 6200 | consumed samples: 4925440 | consumed tokens: 10087301120 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663772E+00 | loss scale: 2048.0 | grad norm: 4.880 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.702 | TFLOPs: 42.28 | +[default7]: iteration 4811/ 6200 | consumed samples: 4926464 | consumed tokens: 10089398272 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671437E+00 | loss scale: 2048.0 | grad norm: 6.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.517 | TFLOPs: 42.22 | +[default7]: iteration 4812/ 6200 | consumed samples: 4927488 | consumed tokens: 10091495424 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654588E+00 | loss scale: 2048.0 | grad norm: 5.678 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.756 | TFLOPs: 42.29 | +[default7]: iteration 4813/ 6200 | consumed samples: 4928512 | consumed tokens: 10093592576 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647026E+00 | loss scale: 2048.0 | grad norm: 5.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.675 | TFLOPs: 42.27 | +[default7]: iteration 4814/ 6200 | consumed samples: 4929536 | consumed tokens: 10095689728 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651377E+00 | loss scale: 2048.0 | grad norm: 6.076 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.788 | TFLOPs: 42.30 | +[default7]: iteration 4815/ 6200 | consumed samples: 4930560 | consumed tokens: 10097786880 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.697921E+00 | loss scale: 2048.0 | grad norm: 6.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.618 | TFLOPs: 42.25 | +[default7]: iteration 4816/ 6200 | consumed samples: 4931584 | consumed tokens: 10099884032 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663635E+00 | loss scale: 2048.0 | grad norm: 7.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.661 | TFLOPs: 42.27 | +[default7]: iteration 4817/ 6200 | consumed samples: 4932608 | consumed tokens: 10101981184 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654396E+00 | loss scale: 2048.0 | grad norm: 4.975 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.514 | TFLOPs: 42.22 | +[default7]: iteration 4818/ 6200 | consumed samples: 4933632 | consumed tokens: 10104078336 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669166E+00 | loss scale: 2048.0 | grad norm: 4.808 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.088 | TFLOPs: 42.40 | +[default7]: iteration 4819/ 6200 | consumed samples: 4934656 | consumed tokens: 10106175488 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.606444E+00 | loss scale: 2048.0 | grad norm: 4.977 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.012 | TFLOPs: 42.37 | +[default7]: iteration 4820/ 6200 | consumed samples: 4935680 | consumed tokens: 10108272640 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632303E+00 | loss scale: 2048.0 | grad norm: 4.994 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.913 | TFLOPs: 42.34 | +[default7]: iteration 4821/ 6200 | consumed samples: 4936704 | consumed tokens: 10110369792 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646965E+00 | loss scale: 2048.0 | grad norm: 5.865 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.867 | TFLOPs: 42.33 | +[default7]: iteration 4822/ 6200 | consumed samples: 4937728 | consumed tokens: 10112466944 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.708724E+00 | loss scale: 2048.0 | grad norm: 5.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.880 | TFLOPs: 42.33 | +[default7]: iteration 4823/ 6200 | consumed samples: 4938752 | consumed tokens: 10114564096 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670562E+00 | loss scale: 2048.0 | grad norm: 5.993 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.937 | TFLOPs: 42.35 | +[default7]: iteration 4824/ 6200 | consumed samples: 4939776 | consumed tokens: 10116661248 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674532E+00 | loss scale: 2048.0 | grad norm: 5.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.916 | TFLOPs: 42.34 | +[default7]: iteration 4825/ 6200 | consumed samples: 4940800 | consumed tokens: 10118758400 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678371E+00 | loss scale: 2048.0 | grad norm: 4.885 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.847 | TFLOPs: 42.32 | +[default7]: iteration 4826/ 6200 | consumed samples: 4941824 | consumed tokens: 10120855552 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670673E+00 | loss scale: 2048.0 | grad norm: 5.926 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.904 | TFLOPs: 42.34 | +[default7]: iteration 4827/ 6200 | consumed samples: 4942848 | consumed tokens: 10122952704 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661185E+00 | loss scale: 2048.0 | grad norm: 4.928 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.785 | TFLOPs: 42.30 | +[default7]: iteration 4828/ 6200 | consumed samples: 4943872 | consumed tokens: 10125049856 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.664223E+00 | loss scale: 2048.0 | grad norm: 4.778 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.077 | TFLOPs: 42.39 | +[default7]: iteration 4829/ 6200 | consumed samples: 4944896 | consumed tokens: 10127147008 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652630E+00 | loss scale: 2048.0 | grad norm: 6.001 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.754 | TFLOPs: 42.29 | +[default7]: iteration 4830/ 6200 | consumed samples: 4945920 | consumed tokens: 10129244160 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654359E+00 | loss scale: 2048.0 | grad norm: 5.875 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.847 | TFLOPs: 42.32 | +[default7]: iteration 4831/ 6200 | consumed samples: 4946944 | consumed tokens: 10131341312 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653728E+00 | loss scale: 2048.0 | grad norm: 6.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.104 | TFLOPs: 42.40 | +[default7]: iteration 4832/ 6200 | consumed samples: 4947968 | consumed tokens: 10133438464 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669644E+00 | loss scale: 2048.0 | grad norm: 5.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.019 | TFLOPs: 42.37 | +[default7]: iteration 4833/ 6200 | consumed samples: 4948992 | consumed tokens: 10135535616 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645802E+00 | loss scale: 2048.0 | grad norm: 5.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.962 | TFLOPs: 42.36 | +[default7]: iteration 4834/ 6200 | consumed samples: 4950016 | consumed tokens: 10137632768 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660496E+00 | loss scale: 2048.0 | grad norm: 5.717 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.051 | TFLOPs: 42.38 | +[default7]: iteration 4835/ 6200 | consumed samples: 4951040 | consumed tokens: 10139729920 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635523E+00 | loss scale: 2048.0 | grad norm: 5.760 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.897 | TFLOPs: 42.34 | +[default7]: iteration 4836/ 6200 | consumed samples: 4952064 | consumed tokens: 10141827072 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646341E+00 | loss scale: 2048.0 | grad norm: 5.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.944 | TFLOPs: 42.35 | +[default7]: iteration 4837/ 6200 | consumed samples: 4953088 | consumed tokens: 10143924224 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.664800E+00 | loss scale: 2048.0 | grad norm: 6.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.055 | TFLOPs: 42.39 | +[default7]: iteration 4838/ 6200 | consumed samples: 4954112 | consumed tokens: 10146021376 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653413E+00 | loss scale: 2048.0 | grad norm: 6.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.048 | TFLOPs: 42.38 | +[default7]: iteration 4839/ 6200 | consumed samples: 4955136 | consumed tokens: 10148118528 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649478E+00 | loss scale: 2048.0 | grad norm: 5.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.997 | TFLOPs: 42.37 | +[default7]: iteration 4840/ 6200 | consumed samples: 4956160 | consumed tokens: 10150215680 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670376E+00 | loss scale: 2048.0 | grad norm: 5.935 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.996 | TFLOPs: 42.37 | +[default7]: iteration 4841/ 6200 | consumed samples: 4957184 | consumed tokens: 10152312832 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667483E+00 | loss scale: 2048.0 | grad norm: 5.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.047 | TFLOPs: 42.38 | +[default7]: iteration 4842/ 6200 | consumed samples: 4958208 | consumed tokens: 10154409984 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652810E+00 | loss scale: 2048.0 | grad norm: 5.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.247 | TFLOPs: 42.44 | +[default7]: iteration 4843/ 6200 | consumed samples: 4959232 | consumed tokens: 10156507136 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666639E+00 | loss scale: 2048.0 | grad norm: 5.585 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.235 | TFLOPs: 42.44 | +[default7]: iteration 4844/ 6200 | consumed samples: 4960256 | consumed tokens: 10158604288 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.664743E+00 | loss scale: 2048.0 | grad norm: 7.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.864 | TFLOPs: 42.33 | +[default7]: iteration 4845/ 6200 | consumed samples: 4961280 | consumed tokens: 10160701440 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676267E+00 | loss scale: 2048.0 | grad norm: 8.072 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.328 | TFLOPs: 42.47 | +[default7]: iteration 4846/ 6200 | consumed samples: 4962304 | consumed tokens: 10162798592 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623519E+00 | loss scale: 2048.0 | grad norm: 6.029 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.881 | TFLOPs: 42.33 | +[default7]: iteration 4847/ 6200 | consumed samples: 4963328 | consumed tokens: 10164895744 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652366E+00 | loss scale: 2048.0 | grad norm: 5.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.802 | TFLOPs: 42.31 | +[default7]: iteration 4848/ 6200 | consumed samples: 4964352 | consumed tokens: 10166992896 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.655077E+00 | loss scale: 2048.0 | grad norm: 5.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.052 | TFLOPs: 42.38 | +[default7]: iteration 4849/ 6200 | consumed samples: 4965376 | consumed tokens: 10169090048 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.688490E+00 | loss scale: 2048.0 | grad norm: 6.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.997 | TFLOPs: 42.37 | +[default7]: iteration 4850/ 6200 | consumed samples: 4966400 | consumed tokens: 10171187200 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682196E+00 | loss scale: 2048.0 | grad norm: 6.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.914 | TFLOPs: 42.34 | +[default7]: iteration 4851/ 6200 | consumed samples: 4967424 | consumed tokens: 10173284352 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648765E+00 | loss scale: 2048.0 | grad norm: 5.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.607 | TFLOPs: 42.25 | +[default7]: iteration 4852/ 6200 | consumed samples: 4968448 | consumed tokens: 10175381504 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.695982E+00 | loss scale: 2048.0 | grad norm: 5.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.470 | TFLOPs: 42.21 | +[default7]: iteration 4853/ 6200 | consumed samples: 4969472 | consumed tokens: 10177478656 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683220E+00 | loss scale: 2048.0 | grad norm: 5.547 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.455 | TFLOPs: 42.20 | +[default7]: iteration 4854/ 6200 | consumed samples: 4970496 | consumed tokens: 10179575808 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622650E+00 | loss scale: 2048.0 | grad norm: 4.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.565 | TFLOPs: 42.24 | +[default7]: iteration 4855/ 6200 | consumed samples: 4971520 | consumed tokens: 10181672960 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650361E+00 | loss scale: 2048.0 | grad norm: 6.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.612 | TFLOPs: 42.25 | +[default7]: iteration 4856/ 6200 | consumed samples: 4972544 | consumed tokens: 10183770112 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652599E+00 | loss scale: 2048.0 | grad norm: 5.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.692 | TFLOPs: 42.27 | +[default7]: iteration 4857/ 6200 | consumed samples: 4973568 | consumed tokens: 10185867264 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623125E+00 | loss scale: 2048.0 | grad norm: 4.787 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.654 | TFLOPs: 42.26 | +[default7]: iteration 4858/ 6200 | consumed samples: 4974592 | consumed tokens: 10187964416 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656196E+00 | loss scale: 2048.0 | grad norm: 4.919 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.700 | TFLOPs: 42.28 | +[default7]: iteration 4859/ 6200 | consumed samples: 4975616 | consumed tokens: 10190061568 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653286E+00 | loss scale: 2048.0 | grad norm: 4.925 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.530 | TFLOPs: 42.23 | +[default7]: iteration 4860/ 6200 | consumed samples: 4976640 | consumed tokens: 10192158720 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665880E+00 | loss scale: 2048.0 | grad norm: 5.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.592 | TFLOPs: 42.24 | +[default7]: iteration 4861/ 6200 | consumed samples: 4977664 | consumed tokens: 10194255872 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658045E+00 | loss scale: 2048.0 | grad norm: 4.929 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.729 | TFLOPs: 42.29 | +[default7]: iteration 4862/ 6200 | consumed samples: 4978688 | consumed tokens: 10196353024 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653225E+00 | loss scale: 2048.0 | grad norm: 4.918 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.613 | TFLOPs: 42.25 | +[default7]: iteration 4863/ 6200 | consumed samples: 4979712 | consumed tokens: 10198450176 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673364E+00 | loss scale: 2048.0 | grad norm: 5.834 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.854 | TFLOPs: 42.32 | +[default7]: iteration 4864/ 6200 | consumed samples: 4980736 | consumed tokens: 10200547328 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640707E+00 | loss scale: 2048.0 | grad norm: 6.829 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.491 | TFLOPs: 42.21 | +[default7]: iteration 4865/ 6200 | consumed samples: 4981760 | consumed tokens: 10202644480 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653048E+00 | loss scale: 2048.0 | grad norm: 5.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.566 | TFLOPs: 42.24 | +[default7]: iteration 4866/ 6200 | consumed samples: 4982784 | consumed tokens: 10204741632 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652216E+00 | loss scale: 2048.0 | grad norm: 5.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.822 | TFLOPs: 42.31 | +[default7]: iteration 4867/ 6200 | consumed samples: 4983808 | consumed tokens: 10206838784 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651632E+00 | loss scale: 2048.0 | grad norm: 8.049 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.607 | TFLOPs: 42.25 | +[default7]: iteration 4868/ 6200 | consumed samples: 4984832 | consumed tokens: 10208935936 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650593E+00 | loss scale: 2048.0 | grad norm: 5.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.489 | TFLOPs: 42.21 | +[default7]: iteration 4869/ 6200 | consumed samples: 4985856 | consumed tokens: 10211033088 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.664430E+00 | loss scale: 2048.0 | grad norm: 5.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.698 | TFLOPs: 42.28 | +[default7]: iteration 4870/ 6200 | consumed samples: 4986880 | consumed tokens: 10213130240 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686938E+00 | loss scale: 2048.0 | grad norm: 5.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.697 | TFLOPs: 42.28 | +[default7]: iteration 4871/ 6200 | consumed samples: 4987904 | consumed tokens: 10215227392 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662269E+00 | loss scale: 2048.0 | grad norm: 5.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.741 | TFLOPs: 42.29 | +[default7]: iteration 4872/ 6200 | consumed samples: 4988928 | consumed tokens: 10217324544 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.655127E+00 | loss scale: 2048.0 | grad norm: 5.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.977 | TFLOPs: 42.36 | +[default7]: iteration 4873/ 6200 | consumed samples: 4989952 | consumed tokens: 10219421696 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646954E+00 | loss scale: 2048.0 | grad norm: 5.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.799 | TFLOPs: 42.31 | +[default7]: iteration 4874/ 6200 | consumed samples: 4990976 | consumed tokens: 10221518848 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651398E+00 | loss scale: 2048.0 | grad norm: 5.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.854 | TFLOPs: 42.32 | +[default7]: iteration 4875/ 6200 | consumed samples: 4992000 | consumed tokens: 10223616000 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679060E+00 | loss scale: 2048.0 | grad norm: 5.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.763 | TFLOPs: 42.30 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 4875 | lm loss value: 3.634132E+00 | lm loss PPL: 3.786896E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 4875 | lm loss value: 1.552194E+00 | lm loss PPL: 4.721820E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 4876/ 6200 | consumed samples: 4993024 | consumed tokens: 10225713152 | elapsed time per iteration (s): 66.77 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637046E+00 | loss scale: 2048.0 | grad norm: 4.815 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 15.336 | TFLOPs: 4.67 | +[default7]: iteration 4877/ 6200 | consumed samples: 4994048 | consumed tokens: 10227810304 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.678381E+00 | loss scale: 2048.0 | grad norm: 4.766 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.671 | TFLOPs: 42.27 | +[default7]: iteration 4878/ 6200 | consumed samples: 4995072 | consumed tokens: 10229907456 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661313E+00 | loss scale: 2048.0 | grad norm: 5.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.089 | TFLOPs: 42.09 | +[default7]: iteration 4879/ 6200 | consumed samples: 4996096 | consumed tokens: 10232004608 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675840E+00 | loss scale: 2048.0 | grad norm: 5.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.482 | TFLOPs: 42.21 | +[default7]: iteration 4880/ 6200 | consumed samples: 4997120 | consumed tokens: 10234101760 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648292E+00 | loss scale: 2048.0 | grad norm: 4.786 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.636 | TFLOPs: 42.26 | +[default7]: iteration 4881/ 6200 | consumed samples: 4998144 | consumed tokens: 10236198912 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649186E+00 | loss scale: 2048.0 | grad norm: 6.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.559 | TFLOPs: 42.23 | +[default7]: iteration 4882/ 6200 | consumed samples: 4999168 | consumed tokens: 10238296064 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656632E+00 | loss scale: 2048.0 | grad norm: 5.944 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.555 | TFLOPs: 42.23 | +[default7]: iteration 4883/ 6200 | consumed samples: 5000192 | consumed tokens: 10240393216 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635399E+00 | loss scale: 2048.0 | grad norm: 4.879 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.604 | TFLOPs: 42.25 | +[default7]: iteration 4884/ 6200 | consumed samples: 5001216 | consumed tokens: 10242490368 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663489E+00 | loss scale: 2048.0 | grad norm: 5.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.546 | TFLOPs: 42.23 | +[default7]: iteration 4885/ 6200 | consumed samples: 5002240 | consumed tokens: 10244587520 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633376E+00 | loss scale: 2048.0 | grad norm: 5.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.676 | TFLOPs: 42.27 | +[default7]: iteration 4886/ 6200 | consumed samples: 5003264 | consumed tokens: 10246684672 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646985E+00 | loss scale: 2048.0 | grad norm: 5.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.471 | TFLOPs: 42.21 | +[default7]: iteration 4887/ 6200 | consumed samples: 5004288 | consumed tokens: 10248781824 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.699795E+00 | loss scale: 2048.0 | grad norm: 5.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.547 | TFLOPs: 42.23 | +[default7]: iteration 4888/ 6200 | consumed samples: 5005312 | consumed tokens: 10250878976 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648277E+00 | loss scale: 2048.0 | grad norm: 5.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.064 | TFLOPs: 42.39 | +[default7]: iteration 4889/ 6200 | consumed samples: 5006336 | consumed tokens: 10252976128 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648608E+00 | loss scale: 2048.0 | grad norm: 5.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.146 | TFLOPs: 42.41 | +[default7]: iteration 4890/ 6200 | consumed samples: 5007360 | consumed tokens: 10255073280 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659925E+00 | loss scale: 2048.0 | grad norm: 5.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.040 | TFLOPs: 42.38 | +[default7]: iteration 4891/ 6200 | consumed samples: 5008384 | consumed tokens: 10257170432 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669394E+00 | loss scale: 2048.0 | grad norm: 5.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.759 | TFLOPs: 42.30 | +[default7]: iteration 4892/ 6200 | consumed samples: 5009408 | consumed tokens: 10259267584 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631546E+00 | loss scale: 2048.0 | grad norm: 5.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.042 | TFLOPs: 42.38 | +[default7]: iteration 4893/ 6200 | consumed samples: 5010432 | consumed tokens: 10261364736 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648518E+00 | loss scale: 2048.0 | grad norm: 5.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.024 | TFLOPs: 42.38 | +[default7]: iteration 4894/ 6200 | consumed samples: 5011456 | consumed tokens: 10263461888 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630426E+00 | loss scale: 2048.0 | grad norm: 5.724 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.760 | TFLOPs: 42.30 | +[default7]: iteration 4895/ 6200 | consumed samples: 5012480 | consumed tokens: 10265559040 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667619E+00 | loss scale: 2048.0 | grad norm: 7.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.851 | TFLOPs: 42.32 | +[default7]: iteration 4896/ 6200 | consumed samples: 5013504 | consumed tokens: 10267656192 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641073E+00 | loss scale: 2048.0 | grad norm: 5.007 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.761 | TFLOPs: 42.30 | +[default7]: iteration 4897/ 6200 | consumed samples: 5014528 | consumed tokens: 10269753344 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667758E+00 | loss scale: 2048.0 | grad norm: 5.017 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.954 | TFLOPs: 42.35 | +[default7]: iteration 4898/ 6200 | consumed samples: 5015552 | consumed tokens: 10271850496 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649399E+00 | loss scale: 2048.0 | grad norm: 4.960 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.125 | TFLOPs: 42.41 | +[default7]: iteration 4899/ 6200 | consumed samples: 5016576 | consumed tokens: 10273947648 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644065E+00 | loss scale: 2048.0 | grad norm: 6.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.946 | TFLOPs: 42.35 | +[default7]: iteration 4900/ 6200 | consumed samples: 5017600 | consumed tokens: 10276044800 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645957E+00 | loss scale: 2048.0 | grad norm: 5.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.972 | TFLOPs: 42.36 | +[default7]: iteration 4901/ 6200 | consumed samples: 5018624 | consumed tokens: 10278141952 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661490E+00 | loss scale: 2048.0 | grad norm: 4.734 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.719 | TFLOPs: 42.28 | +[default7]: iteration 4902/ 6200 | consumed samples: 5019648 | consumed tokens: 10280239104 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662768E+00 | loss scale: 2048.0 | grad norm: 6.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.996 | TFLOPs: 42.37 | +[default7]: iteration 4903/ 6200 | consumed samples: 5020672 | consumed tokens: 10282336256 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683085E+00 | loss scale: 2048.0 | grad norm: 5.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.094 | TFLOPs: 42.40 | +[default7]: iteration 4904/ 6200 | consumed samples: 5021696 | consumed tokens: 10284433408 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638729E+00 | loss scale: 2048.0 | grad norm: 5.672 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.147 | TFLOPs: 42.41 | +[default7]: iteration 4905/ 6200 | consumed samples: 5022720 | consumed tokens: 10286530560 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637822E+00 | loss scale: 2048.0 | grad norm: 5.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.809 | TFLOPs: 42.31 | +[default7]: iteration 4906/ 6200 | consumed samples: 5023744 | consumed tokens: 10288627712 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.664658E+00 | loss scale: 2048.0 | grad norm: 5.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.805 | TFLOPs: 42.31 | +[default7]: iteration 4907/ 6200 | consumed samples: 5024768 | consumed tokens: 10290724864 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644091E+00 | loss scale: 2048.0 | grad norm: 5.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.694 | TFLOPs: 42.28 | +[default7]: iteration 4908/ 6200 | consumed samples: 5025792 | consumed tokens: 10292822016 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635936E+00 | loss scale: 2048.0 | grad norm: 5.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.523 | TFLOPs: 42.22 | +[default7]: iteration 4909/ 6200 | consumed samples: 5026816 | consumed tokens: 10294919168 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654285E+00 | loss scale: 2048.0 | grad norm: 5.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.491 | TFLOPs: 42.21 | +[default7]: iteration 4910/ 6200 | consumed samples: 5027840 | consumed tokens: 10297016320 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.694567E+00 | loss scale: 2048.0 | grad norm: 5.825 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.391 | TFLOPs: 42.18 | +[default7]: iteration 4911/ 6200 | consumed samples: 5028864 | consumed tokens: 10299113472 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652143E+00 | loss scale: 2048.0 | grad norm: 5.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.707 | TFLOPs: 42.28 | +[default7]: iteration 4912/ 6200 | consumed samples: 5029888 | consumed tokens: 10301210624 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661931E+00 | loss scale: 2048.0 | grad norm: 5.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.658 | TFLOPs: 42.26 | +[default7]: iteration 4913/ 6200 | consumed samples: 5030912 | consumed tokens: 10303307776 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680037E+00 | loss scale: 2048.0 | grad norm: 4.992 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.922 | TFLOPs: 42.34 | +[default7]: iteration 4914/ 6200 | consumed samples: 5031936 | consumed tokens: 10305404928 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.664065E+00 | loss scale: 2048.0 | grad norm: 5.968 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.977 | TFLOPs: 42.36 | +[default7]: iteration 4915/ 6200 | consumed samples: 5032960 | consumed tokens: 10307502080 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632410E+00 | loss scale: 2048.0 | grad norm: 6.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.765 | TFLOPs: 42.30 | +[default7]: iteration 4916/ 6200 | consumed samples: 5033984 | consumed tokens: 10309599232 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671057E+00 | loss scale: 2048.0 | grad norm: 6.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.277 | TFLOPs: 42.15 | +[default7]: iteration 4917/ 6200 | consumed samples: 5035008 | consumed tokens: 10311696384 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643929E+00 | loss scale: 2048.0 | grad norm: 5.763 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.511 | TFLOPs: 42.22 | +[default7]: iteration 4918/ 6200 | consumed samples: 5036032 | consumed tokens: 10313793536 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672941E+00 | loss scale: 2048.0 | grad norm: 6.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.789 | TFLOPs: 42.30 | +[default7]: iteration 4919/ 6200 | consumed samples: 5037056 | consumed tokens: 10315890688 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681980E+00 | loss scale: 2048.0 | grad norm: 5.726 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.035 | TFLOPs: 42.38 | +[default7]: iteration 4920/ 6200 | consumed samples: 5038080 | consumed tokens: 10317987840 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646707E+00 | loss scale: 2048.0 | grad norm: 5.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.065 | TFLOPs: 42.39 | +[default7]: iteration 4921/ 6200 | consumed samples: 5039104 | consumed tokens: 10320084992 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638856E+00 | loss scale: 2048.0 | grad norm: 5.834 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.029 | TFLOPs: 42.38 | +[default7]: iteration 4922/ 6200 | consumed samples: 5040128 | consumed tokens: 10322182144 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673995E+00 | loss scale: 2048.0 | grad norm: 5.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.969 | TFLOPs: 42.36 | +[default7]: iteration 4923/ 6200 | consumed samples: 5041152 | consumed tokens: 10324279296 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630109E+00 | loss scale: 2048.0 | grad norm: 5.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.996 | TFLOPs: 42.37 | +[default7]: iteration 4924/ 6200 | consumed samples: 5042176 | consumed tokens: 10326376448 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649852E+00 | loss scale: 2048.0 | grad norm: 5.048 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.031 | TFLOPs: 42.38 | +[default7]: iteration 4925/ 6200 | consumed samples: 5043200 | consumed tokens: 10328473600 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654084E+00 | loss scale: 2048.0 | grad norm: 5.601 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.072 | TFLOPs: 42.39 | +[default7]: iteration 4926/ 6200 | consumed samples: 5044224 | consumed tokens: 10330570752 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666840E+00 | loss scale: 2048.0 | grad norm: 5.049 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.194 | TFLOPs: 42.43 | +[default7]: iteration 4927/ 6200 | consumed samples: 5045248 | consumed tokens: 10332667904 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.698743E+00 | loss scale: 2048.0 | grad norm: 6.006 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.439 | TFLOPs: 42.50 | +[default7]: iteration 4928/ 6200 | consumed samples: 5046272 | consumed tokens: 10334765056 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670936E+00 | loss scale: 2048.0 | grad norm: 5.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.246 | TFLOPs: 42.44 | +[default7]: iteration 4929/ 6200 | consumed samples: 5047296 | consumed tokens: 10336862208 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639533E+00 | loss scale: 2048.0 | grad norm: 5.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.255 | TFLOPs: 42.45 | +[default7]: iteration 4930/ 6200 | consumed samples: 5048320 | consumed tokens: 10338959360 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644032E+00 | loss scale: 2048.0 | grad norm: 5.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.067 | TFLOPs: 42.39 | +[default7]: iteration 4931/ 6200 | consumed samples: 5049344 | consumed tokens: 10341056512 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656858E+00 | loss scale: 2048.0 | grad norm: 5.835 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.307 | TFLOPs: 42.46 | +[default7]: iteration 4932/ 6200 | consumed samples: 5050368 | consumed tokens: 10343153664 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662733E+00 | loss scale: 2048.0 | grad norm: 7.019 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.788 | TFLOPs: 42.30 | +[default7]: iteration 4933/ 6200 | consumed samples: 5051392 | consumed tokens: 10345250816 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672614E+00 | loss scale: 2048.0 | grad norm: 5.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.788 | TFLOPs: 42.30 | +[default7]: iteration 4934/ 6200 | consumed samples: 5052416 | consumed tokens: 10347347968 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666344E+00 | loss scale: 2048.0 | grad norm: 5.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.260 | TFLOPs: 42.14 | +[default7]: iteration 4935/ 6200 | consumed samples: 5053440 | consumed tokens: 10349445120 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676632E+00 | loss scale: 2048.0 | grad norm: 5.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.916 | TFLOPs: 42.34 | +[default7]: iteration 4936/ 6200 | consumed samples: 5054464 | consumed tokens: 10351542272 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673281E+00 | loss scale: 2048.0 | grad norm: 4.878 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.056 | TFLOPs: 42.39 | +[default7]: iteration 4937/ 6200 | consumed samples: 5055488 | consumed tokens: 10353639424 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.677914E+00 | loss scale: 2048.0 | grad norm: 5.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.147 | TFLOPs: 42.41 | +[default7]: iteration 4938/ 6200 | consumed samples: 5056512 | consumed tokens: 10355736576 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628640E+00 | loss scale: 2048.0 | grad norm: 7.079 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.106 | TFLOPs: 42.40 | +[default7]: iteration 4939/ 6200 | consumed samples: 5057536 | consumed tokens: 10357833728 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673874E+00 | loss scale: 2048.0 | grad norm: 5.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.029 | TFLOPs: 42.38 | +[default7]: iteration 4940/ 6200 | consumed samples: 5058560 | consumed tokens: 10359930880 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661070E+00 | loss scale: 2048.0 | grad norm: 5.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.499 | TFLOPs: 42.22 | +[default7]: iteration 4941/ 6200 | consumed samples: 5059584 | consumed tokens: 10362028032 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638037E+00 | loss scale: 2048.0 | grad norm: 5.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.956 | TFLOPs: 42.35 | +[default7]: iteration 4942/ 6200 | consumed samples: 5060608 | consumed tokens: 10364125184 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647954E+00 | loss scale: 2048.0 | grad norm: 4.766 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.827 | TFLOPs: 42.32 | +[default7]: iteration 4943/ 6200 | consumed samples: 5061632 | consumed tokens: 10366222336 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645524E+00 | loss scale: 2048.0 | grad norm: 5.999 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.527 | TFLOPs: 42.22 | +[default7]: iteration 4944/ 6200 | consumed samples: 5062656 | consumed tokens: 10368319488 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656194E+00 | loss scale: 2048.0 | grad norm: 6.640 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.519 | TFLOPs: 42.22 | +[default7]: iteration 4945/ 6200 | consumed samples: 5063680 | consumed tokens: 10370416640 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675443E+00 | loss scale: 2048.0 | grad norm: 5.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.822 | TFLOPs: 42.31 | +[default7]: iteration 4946/ 6200 | consumed samples: 5064704 | consumed tokens: 10372513792 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654052E+00 | loss scale: 2048.0 | grad norm: 5.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.027 | TFLOPs: 42.38 | +[default7]: iteration 4947/ 6200 | consumed samples: 5065728 | consumed tokens: 10374610944 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667070E+00 | loss scale: 2048.0 | grad norm: 5.693 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.648 | TFLOPs: 42.26 | +[default7]: iteration 4948/ 6200 | consumed samples: 5066752 | consumed tokens: 10376708096 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662098E+00 | loss scale: 2048.0 | grad norm: 6.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.914 | TFLOPs: 42.34 | +[default7]: iteration 4949/ 6200 | consumed samples: 5067776 | consumed tokens: 10378805248 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650683E+00 | loss scale: 2048.0 | grad norm: 5.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.587 | TFLOPs: 42.24 | +[default7]: iteration 4950/ 6200 | consumed samples: 5068800 | consumed tokens: 10380902400 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653155E+00 | loss scale: 2048.0 | grad norm: 6.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.668 | TFLOPs: 42.27 | +[default7]: iteration 4951/ 6200 | consumed samples: 5069824 | consumed tokens: 10382999552 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657536E+00 | loss scale: 2048.0 | grad norm: 5.569 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.955 | TFLOPs: 42.35 | +[default7]: iteration 4952/ 6200 | consumed samples: 5070848 | consumed tokens: 10385096704 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663529E+00 | loss scale: 2048.0 | grad norm: 4.917 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.986 | TFLOPs: 42.36 | +[default7]: iteration 4953/ 6200 | consumed samples: 5071872 | consumed tokens: 10387193856 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634147E+00 | loss scale: 2048.0 | grad norm: 4.897 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.705 | TFLOPs: 42.28 | +[default7]: iteration 4954/ 6200 | consumed samples: 5072896 | consumed tokens: 10389291008 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665861E+00 | loss scale: 2048.0 | grad norm: 5.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.852 | TFLOPs: 42.32 | +[default7]: iteration 4955/ 6200 | consumed samples: 5073920 | consumed tokens: 10391388160 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666942E+00 | loss scale: 2048.0 | grad norm: 4.791 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.837 | TFLOPs: 42.32 | +[default7]: iteration 4956/ 6200 | consumed samples: 5074944 | consumed tokens: 10393485312 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651070E+00 | loss scale: 2048.0 | grad norm: 5.822 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.899 | TFLOPs: 42.34 | +[default7]: iteration 4957/ 6200 | consumed samples: 5075968 | consumed tokens: 10395582464 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654889E+00 | loss scale: 2048.0 | grad norm: 5.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.734 | TFLOPs: 42.29 | +[default7]: iteration 4958/ 6200 | consumed samples: 5076992 | consumed tokens: 10397679616 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660448E+00 | loss scale: 2048.0 | grad norm: 6.966 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.963 | TFLOPs: 42.36 | +[default7]: iteration 4959/ 6200 | consumed samples: 5078016 | consumed tokens: 10399776768 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621633E+00 | loss scale: 2048.0 | grad norm: 5.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.780 | TFLOPs: 42.30 | +[default7]: iteration 4960/ 6200 | consumed samples: 5079040 | consumed tokens: 10401873920 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681805E+00 | loss scale: 2048.0 | grad norm: 5.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.835 | TFLOPs: 42.32 | +[default7]: iteration 4961/ 6200 | consumed samples: 5080064 | consumed tokens: 10403971072 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647381E+00 | loss scale: 2048.0 | grad norm: 4.996 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.841 | TFLOPs: 42.32 | +[default7]: iteration 4962/ 6200 | consumed samples: 5081088 | consumed tokens: 10406068224 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661519E+00 | loss scale: 2048.0 | grad norm: 6.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.535 | TFLOPs: 42.23 | +[default7]: iteration 4963/ 6200 | consumed samples: 5082112 | consumed tokens: 10408165376 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643014E+00 | loss scale: 2048.0 | grad norm: 5.947 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.565 | TFLOPs: 42.24 | +[default7]: iteration 4964/ 6200 | consumed samples: 5083136 | consumed tokens: 10410262528 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653545E+00 | loss scale: 2048.0 | grad norm: 5.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.746 | TFLOPs: 42.29 | +[default7]: iteration 4965/ 6200 | consumed samples: 5084160 | consumed tokens: 10412359680 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660646E+00 | loss scale: 2048.0 | grad norm: 5.749 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.443 | TFLOPs: 42.20 | +[default7]: iteration 4966/ 6200 | consumed samples: 5085184 | consumed tokens: 10414456832 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675983E+00 | loss scale: 2048.0 | grad norm: 5.917 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.670 | TFLOPs: 42.27 | +[default7]: iteration 4967/ 6200 | consumed samples: 5086208 | consumed tokens: 10416553984 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644981E+00 | loss scale: 2048.0 | grad norm: 5.644 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.039 | TFLOPs: 42.38 | +[default7]: iteration 4968/ 6200 | consumed samples: 5087232 | consumed tokens: 10418651136 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653289E+00 | loss scale: 2048.0 | grad norm: 5.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.386 | TFLOPs: 42.18 | +[default7]: iteration 4969/ 6200 | consumed samples: 5088256 | consumed tokens: 10420748288 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644640E+00 | loss scale: 2048.0 | grad norm: 5.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.601 | TFLOPs: 42.25 | +[default7]: iteration 4970/ 6200 | consumed samples: 5089280 | consumed tokens: 10422845440 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650844E+00 | loss scale: 2048.0 | grad norm: 5.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.634 | TFLOPs: 42.26 | +[default7]: iteration 4971/ 6200 | consumed samples: 5090304 | consumed tokens: 10424942592 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.603528E+00 | loss scale: 2048.0 | grad norm: 5.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.747 | TFLOPs: 42.29 | +[default7]: iteration 4972/ 6200 | consumed samples: 5091328 | consumed tokens: 10427039744 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659499E+00 | loss scale: 2048.0 | grad norm: 5.605 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.657 | TFLOPs: 42.26 | +[default7]: iteration 4973/ 6200 | consumed samples: 5092352 | consumed tokens: 10429136896 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665925E+00 | loss scale: 2048.0 | grad norm: 6.047 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.537 | TFLOPs: 42.23 | +[default7]: iteration 4974/ 6200 | consumed samples: 5093376 | consumed tokens: 10431234048 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615874E+00 | loss scale: 2048.0 | grad norm: 5.542 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.710 | TFLOPs: 42.28 | +[default7]: iteration 4975/ 6200 | consumed samples: 5094400 | consumed tokens: 10433331200 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638258E+00 | loss scale: 2048.0 | grad norm: 4.855 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.276 | TFLOPs: 42.15 | +[default7]: iteration 4976/ 6200 | consumed samples: 5095424 | consumed tokens: 10435428352 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675761E+00 | loss scale: 2048.0 | grad norm: 5.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.266 | TFLOPs: 42.14 | +[default7]: iteration 4977/ 6200 | consumed samples: 5096448 | consumed tokens: 10437525504 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647925E+00 | loss scale: 2048.0 | grad norm: 6.026 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.497 | TFLOPs: 42.22 | +[default7]: iteration 4978/ 6200 | consumed samples: 5097472 | consumed tokens: 10439622656 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668400E+00 | loss scale: 2048.0 | grad norm: 5.678 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.578 | TFLOPs: 42.24 | +[default7]: iteration 4979/ 6200 | consumed samples: 5098496 | consumed tokens: 10441719808 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667452E+00 | loss scale: 2048.0 | grad norm: 4.866 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.493 | TFLOPs: 42.21 | +[default7]: iteration 4980/ 6200 | consumed samples: 5099520 | consumed tokens: 10443816960 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649434E+00 | loss scale: 2048.0 | grad norm: 4.988 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.639 | TFLOPs: 42.26 | +[default7]: iteration 4981/ 6200 | consumed samples: 5100544 | consumed tokens: 10445914112 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662794E+00 | loss scale: 2048.0 | grad norm: 5.011 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.586 | TFLOPs: 42.24 | +[default7]: iteration 4982/ 6200 | consumed samples: 5101568 | consumed tokens: 10448011264 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643472E+00 | loss scale: 2048.0 | grad norm: 5.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.686 | TFLOPs: 42.27 | +[default7]: iteration 4983/ 6200 | consumed samples: 5102592 | consumed tokens: 10450108416 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660071E+00 | loss scale: 2048.0 | grad norm: 5.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.200 | TFLOPs: 42.12 | +[default7]: iteration 4984/ 6200 | consumed samples: 5103616 | consumed tokens: 10452205568 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.664060E+00 | loss scale: 2048.0 | grad norm: 4.802 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.656 | TFLOPs: 42.26 | +[default7]: iteration 4985/ 6200 | consumed samples: 5104640 | consumed tokens: 10454302720 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662457E+00 | loss scale: 2048.0 | grad norm: 5.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.849 | TFLOPs: 42.32 | +[default7]: iteration 4986/ 6200 | consumed samples: 5105664 | consumed tokens: 10456399872 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653561E+00 | loss scale: 2048.0 | grad norm: 5.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.863 | TFLOPs: 42.33 | +[default7]: iteration 4987/ 6200 | consumed samples: 5106688 | consumed tokens: 10458497024 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650254E+00 | loss scale: 2048.0 | grad norm: 4.998 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.510 | TFLOPs: 42.22 | +[default7]: iteration 4988/ 6200 | consumed samples: 5107712 | consumed tokens: 10460594176 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637106E+00 | loss scale: 2048.0 | grad norm: 5.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.509 | TFLOPs: 42.22 | +[default7]: iteration 4989/ 6200 | consumed samples: 5108736 | consumed tokens: 10462691328 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642685E+00 | loss scale: 2048.0 | grad norm: 5.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.719 | TFLOPs: 42.28 | +[default7]: iteration 4990/ 6200 | consumed samples: 5109760 | consumed tokens: 10464788480 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.664093E+00 | loss scale: 2048.0 | grad norm: 6.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.397 | TFLOPs: 42.18 | +[default7]: iteration 4991/ 6200 | consumed samples: 5110784 | consumed tokens: 10466885632 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666158E+00 | loss scale: 2048.0 | grad norm: 5.762 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.500 | TFLOPs: 42.22 | +[default7]: iteration 4992/ 6200 | consumed samples: 5111808 | consumed tokens: 10468982784 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630949E+00 | loss scale: 2048.0 | grad norm: 5.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.839 | TFLOPs: 42.32 | +[default7]: iteration 4993/ 6200 | consumed samples: 5112832 | consumed tokens: 10471079936 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633727E+00 | loss scale: 2048.0 | grad norm: 6.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.298 | TFLOPs: 42.46 | +[default7]: iteration 4994/ 6200 | consumed samples: 5113856 | consumed tokens: 10473177088 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669101E+00 | loss scale: 2048.0 | grad norm: 4.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.033 | TFLOPs: 42.38 | +[default7]: iteration 4995/ 6200 | consumed samples: 5114880 | consumed tokens: 10475274240 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632700E+00 | loss scale: 2048.0 | grad norm: 5.737 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.009 | TFLOPs: 42.37 | +[default7]: iteration 4996/ 6200 | consumed samples: 5115904 | consumed tokens: 10477371392 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675477E+00 | loss scale: 2048.0 | grad norm: 5.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.183 | TFLOPs: 42.42 | +[default7]: iteration 4997/ 6200 | consumed samples: 5116928 | consumed tokens: 10479468544 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649887E+00 | loss scale: 2048.0 | grad norm: 5.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.334 | TFLOPs: 42.47 | +[default7]: iteration 4998/ 6200 | consumed samples: 5117952 | consumed tokens: 10481565696 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667048E+00 | loss scale: 2048.0 | grad norm: 5.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.364 | TFLOPs: 42.48 | +[default7]: iteration 4999/ 6200 | consumed samples: 5118976 | consumed tokens: 10483662848 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.655320E+00 | loss scale: 2048.0 | grad norm: 6.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.264 | TFLOPs: 42.45 | +[default7]: iteration 5000/ 6200 | consumed samples: 5120000 | consumed tokens: 10485760000 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672805E+00 | loss scale: 2048.0 | grad norm: 5.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.179 | TFLOPs: 42.42 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 5000 | lm loss value: 3.651329E+00 | lm loss PPL: 3.852583E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 5000 | lm loss value: 1.545961E+00 | lm loss PPL: 4.692481E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 5000 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-06 23:29:10,707] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5000 is begin to save! +[default0]:[2022-10-06 23:29:10,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_01-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_01-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_03-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_03-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_04-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_04-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_05-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_05-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_06-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_06-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,222] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_07-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_07-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,249] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_08-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_08-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,276] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_09-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_09-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,303] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_10-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_10-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,331] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_11-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_11-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_12-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_12-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,387] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_13-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_13-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,415] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_14-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_14-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_15-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_15-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_16-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_16-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,497] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_17-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_17-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_18-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_18-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,554] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_19-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_19-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,581] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_20-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_20-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,608] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_21-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_21-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_22-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_22-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,666] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_23-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_23-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_24-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_24-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_25-model_00-model_states.pt... +[default3]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default7]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 23:29:11,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_25-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,752] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_26-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_26-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,781] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_28-model_00-model_states.pt... +[default0]:[2022-10-06 23:29:11,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/layer_28-model_00-model_states.pt. +[default0]:[2022-10-06 23:29:11,782] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/mp_rank_00_model_states.pt +[default0]:[2022-10-06 23:29:11,782] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/mp_rank_00_model_states.pt... +[default0]:[2022-10-06 23:29:11,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/mp_rank_00_model_states.pt. +[default0]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default3]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default0]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default4]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default5]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default1]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default2]:[2022-10-06 23:29:11,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default6]:[2022-10-06 23:29:11,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 23:29:11,974] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 23:29:11,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 23:29:12,001] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 23:29:12,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 23:29:12,000] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 23:29:12,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 23:29:12,090] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 23:29:12,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 23:29:12,008] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 23:29:12,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 23:29:12,067] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 23:29:12,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 23:29:12,038] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 23:29:12,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 23:29:12,019] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 23:29:12,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 23:29:12,111] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 23:29:12,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 23:29:12,113] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 23:29:12,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 23:29:12,132] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 23:29:12,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 23:29:12,110] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 23:29:12,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 23:29:12,091] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 23:29:12,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 23:29:12,125] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 23:29:12,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 23:29:12,126] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 23:29:12,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 23:29:12,116] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 23:29:12,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 23:29:12,118] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 23:29:12,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 23:29:12,114] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 23:29:12,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 23:29:12,121] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 23:29:12,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 23:29:12,132] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 23:29:12,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 23:29:12,091] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 23:29:12,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 23:29:12,112] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 23:29:12,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 23:29:12,132] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 23:29:12,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 23:29:12,126] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default7]:[2022-10-06 23:29:12,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-06 23:29:12,208] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default6]:[2022-10-06 23:29:12,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-06 23:29:12,193] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default0]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default0]: successfully saved checkpoint at iteration 5000 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default2]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default2]:[2022-10-06 23:29:12,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-06 23:29:12,181] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default2]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default0]:[2022-10-06 23:29:12,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-06 23:29:12,209] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default0]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default6]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default3]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default2]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default3]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default1]:[2022-10-06 23:29:12,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-06 23:29:12,218] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default1]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default3]:[2022-10-06 23:29:12,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-06 23:29:12,221] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default3]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default6]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default5]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default5]:[2022-10-06 23:29:12,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-06 23:29:12,236] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default0]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default4]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default5]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default1]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default6]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default2]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default3]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default7]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default4]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default4]:[2022-10-06 23:29:12,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-06 23:29:12,233] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5000/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default7]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default7]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default7]:time (ms) | save-checkpoint: 1530.72 +[default5]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default4]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default7]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default1]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default6]:[2022-10-06 23:29:12,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +[default7]: iteration 5001/ 6200 | consumed samples: 5121024 | consumed tokens: 10487857152 | elapsed time per iteration (s): 53.91 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665586E+00 | loss scale: 2048.0 | grad norm: 4.886 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 18.995 | TFLOPs: 5.79 | +[default7]: iteration 5002/ 6200 | consumed samples: 5122048 | consumed tokens: 10489954304 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661341E+00 | loss scale: 2048.0 | grad norm: 5.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.959 | TFLOPs: 42.36 | +[default7]: iteration 5003/ 6200 | consumed samples: 5123072 | consumed tokens: 10492051456 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653654E+00 | loss scale: 2048.0 | grad norm: 5.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.017 | TFLOPs: 42.37 | +[default7]: iteration 5004/ 6200 | consumed samples: 5124096 | consumed tokens: 10494148608 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665349E+00 | loss scale: 2048.0 | grad norm: 6.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.089 | TFLOPs: 42.40 | +[default7]: iteration 5005/ 6200 | consumed samples: 5125120 | consumed tokens: 10496245760 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666338E+00 | loss scale: 2048.0 | grad norm: 5.582 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.041 | TFLOPs: 42.38 | +[default7]: iteration 5006/ 6200 | consumed samples: 5126144 | consumed tokens: 10498342912 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653349E+00 | loss scale: 2048.0 | grad norm: 4.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.237 | TFLOPs: 42.44 | +[default7]: iteration 5007/ 6200 | consumed samples: 5127168 | consumed tokens: 10500440064 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662352E+00 | loss scale: 2048.0 | grad norm: 5.907 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.445 | TFLOPs: 42.50 | +[default7]: iteration 5008/ 6200 | consumed samples: 5128192 | consumed tokens: 10502537216 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.677123E+00 | loss scale: 2048.0 | grad norm: 6.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.320 | TFLOPs: 42.47 | +[default7]: iteration 5009/ 6200 | consumed samples: 5129216 | consumed tokens: 10504634368 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630956E+00 | loss scale: 2048.0 | grad norm: 4.816 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.870 | TFLOPs: 42.33 | +[default7]: iteration 5010/ 6200 | consumed samples: 5130240 | consumed tokens: 10506731520 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638366E+00 | loss scale: 2048.0 | grad norm: 5.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.184 | TFLOPs: 42.42 | +[default7]: iteration 5011/ 6200 | consumed samples: 5131264 | consumed tokens: 10508828672 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652288E+00 | loss scale: 2048.0 | grad norm: 6.009 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.027 | TFLOPs: 42.38 | +[default7]: iteration 5012/ 6200 | consumed samples: 5132288 | consumed tokens: 10510925824 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683236E+00 | loss scale: 2048.0 | grad norm: 5.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.006 | TFLOPs: 42.37 | +[default7]: iteration 5013/ 6200 | consumed samples: 5133312 | consumed tokens: 10513022976 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646016E+00 | loss scale: 2048.0 | grad norm: 5.013 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.891 | TFLOPs: 42.34 | +[default7]: iteration 5014/ 6200 | consumed samples: 5134336 | consumed tokens: 10515120128 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662704E+00 | loss scale: 2048.0 | grad norm: 5.602 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.817 | TFLOPs: 42.31 | +[default7]: iteration 5015/ 6200 | consumed samples: 5135360 | consumed tokens: 10517217280 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662527E+00 | loss scale: 2048.0 | grad norm: 4.955 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.023 | TFLOPs: 42.38 | +[default7]: iteration 5016/ 6200 | consumed samples: 5136384 | consumed tokens: 10519314432 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638228E+00 | loss scale: 2048.0 | grad norm: 5.571 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.314 | TFLOPs: 42.46 | +[default7]: iteration 5017/ 6200 | consumed samples: 5137408 | consumed tokens: 10521411584 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654284E+00 | loss scale: 2048.0 | grad norm: 4.961 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.379 | TFLOPs: 42.48 | +[default7]: iteration 5018/ 6200 | consumed samples: 5138432 | consumed tokens: 10523508736 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653158E+00 | loss scale: 2048.0 | grad norm: 4.706 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.373 | TFLOPs: 42.48 | +[default7]: iteration 5019/ 6200 | consumed samples: 5139456 | consumed tokens: 10525605888 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666469E+00 | loss scale: 2048.0 | grad norm: 4.625 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.437 | TFLOPs: 42.50 | +[default7]: iteration 5020/ 6200 | consumed samples: 5140480 | consumed tokens: 10527703040 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670549E+00 | loss scale: 2048.0 | grad norm: 5.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.395 | TFLOPs: 42.49 | +[default7]: iteration 5021/ 6200 | consumed samples: 5141504 | consumed tokens: 10529800192 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663939E+00 | loss scale: 2048.0 | grad norm: 6.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.775 | TFLOPs: 42.30 | +[default7]: iteration 5022/ 6200 | consumed samples: 5142528 | consumed tokens: 10531897344 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651625E+00 | loss scale: 2048.0 | grad norm: 5.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.796 | TFLOPs: 42.31 | +[default7]: iteration 5023/ 6200 | consumed samples: 5143552 | consumed tokens: 10533994496 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638299E+00 | loss scale: 2048.0 | grad norm: 6.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.611 | TFLOPs: 42.25 | +[default7]: iteration 5024/ 6200 | consumed samples: 5144576 | consumed tokens: 10536091648 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622721E+00 | loss scale: 2048.0 | grad norm: 5.782 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.142 | TFLOPs: 42.41 | +[default7]: iteration 5025/ 6200 | consumed samples: 5145600 | consumed tokens: 10538188800 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647425E+00 | loss scale: 2048.0 | grad norm: 5.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.375 | TFLOPs: 42.48 | +[default7]: iteration 5026/ 6200 | consumed samples: 5146624 | consumed tokens: 10540285952 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667942E+00 | loss scale: 2048.0 | grad norm: 5.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.278 | TFLOPs: 42.45 | +[default7]: iteration 5027/ 6200 | consumed samples: 5147648 | consumed tokens: 10542383104 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662695E+00 | loss scale: 2048.0 | grad norm: 5.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.315 | TFLOPs: 42.46 | +[default7]: iteration 5028/ 6200 | consumed samples: 5148672 | consumed tokens: 10544480256 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631247E+00 | loss scale: 2048.0 | grad norm: 7.624 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.432 | TFLOPs: 42.50 | +[default7]: iteration 5029/ 6200 | consumed samples: 5149696 | consumed tokens: 10546577408 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652844E+00 | loss scale: 2048.0 | grad norm: 5.873 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.183 | TFLOPs: 42.42 | +[default7]: iteration 5030/ 6200 | consumed samples: 5150720 | consumed tokens: 10548674560 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649050E+00 | loss scale: 2048.0 | grad norm: 5.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.606 | TFLOPs: 42.25 | +[default7]: iteration 5031/ 6200 | consumed samples: 5151744 | consumed tokens: 10550771712 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666394E+00 | loss scale: 2048.0 | grad norm: 5.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.539 | TFLOPs: 42.23 | +[default7]: iteration 5032/ 6200 | consumed samples: 5152768 | consumed tokens: 10552868864 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649674E+00 | loss scale: 2048.0 | grad norm: 6.988 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.474 | TFLOPs: 42.21 | +[default7]: iteration 5033/ 6200 | consumed samples: 5153792 | consumed tokens: 10554966016 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643557E+00 | loss scale: 2048.0 | grad norm: 5.577 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.557 | TFLOPs: 42.23 | +[default7]: iteration 5034/ 6200 | consumed samples: 5154816 | consumed tokens: 10557063168 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668203E+00 | loss scale: 2048.0 | grad norm: 5.841 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.629 | TFLOPs: 42.26 | +[default7]: iteration 5035/ 6200 | consumed samples: 5155840 | consumed tokens: 10559160320 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634295E+00 | loss scale: 2048.0 | grad norm: 4.977 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.490 | TFLOPs: 42.21 | +[default7]: iteration 5036/ 6200 | consumed samples: 5156864 | consumed tokens: 10561257472 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642852E+00 | loss scale: 2048.0 | grad norm: 5.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.838 | TFLOPs: 42.32 | +[default7]: iteration 5037/ 6200 | consumed samples: 5157888 | consumed tokens: 10563354624 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632683E+00 | loss scale: 2048.0 | grad norm: 5.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.722 | TFLOPs: 42.28 | +[default7]: iteration 5038/ 6200 | consumed samples: 5158912 | consumed tokens: 10565451776 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652863E+00 | loss scale: 2048.0 | grad norm: 5.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.549 | TFLOPs: 42.23 | +[default7]: iteration 5039/ 6200 | consumed samples: 5159936 | consumed tokens: 10567548928 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666425E+00 | loss scale: 2048.0 | grad norm: 5.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.684 | TFLOPs: 42.27 | +[default7]: iteration 5040/ 6200 | consumed samples: 5160960 | consumed tokens: 10569646080 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661246E+00 | loss scale: 2048.0 | grad norm: 4.834 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.613 | TFLOPs: 42.25 | +[default7]: iteration 5041/ 6200 | consumed samples: 5161984 | consumed tokens: 10571743232 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642242E+00 | loss scale: 2048.0 | grad norm: 6.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.562 | TFLOPs: 42.23 | +[default7]: iteration 5042/ 6200 | consumed samples: 5163008 | consumed tokens: 10573840384 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634227E+00 | loss scale: 2048.0 | grad norm: 4.869 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.471 | TFLOPs: 42.21 | +[default7]: iteration 5043/ 6200 | consumed samples: 5164032 | consumed tokens: 10575937536 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661394E+00 | loss scale: 2048.0 | grad norm: 5.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.497 | TFLOPs: 42.22 | +[default7]: iteration 5044/ 6200 | consumed samples: 5165056 | consumed tokens: 10578034688 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643128E+00 | loss scale: 2048.0 | grad norm: 6.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.664 | TFLOPs: 42.27 | +[default7]: iteration 5045/ 6200 | consumed samples: 5166080 | consumed tokens: 10580131840 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672230E+00 | loss scale: 2048.0 | grad norm: 5.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.696 | TFLOPs: 42.28 | +[default7]: iteration 5046/ 6200 | consumed samples: 5167104 | consumed tokens: 10582228992 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633056E+00 | loss scale: 2048.0 | grad norm: 5.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.765 | TFLOPs: 42.30 | +[default7]: iteration 5047/ 6200 | consumed samples: 5168128 | consumed tokens: 10584326144 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.608810E+00 | loss scale: 2048.0 | grad norm: 5.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.468 | TFLOPs: 42.21 | +[default7]: iteration 5048/ 6200 | consumed samples: 5169152 | consumed tokens: 10586423296 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639667E+00 | loss scale: 2048.0 | grad norm: 5.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.729 | TFLOPs: 42.29 | +[default7]: iteration 5049/ 6200 | consumed samples: 5170176 | consumed tokens: 10588520448 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629503E+00 | loss scale: 2048.0 | grad norm: 5.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.795 | TFLOPs: 42.31 | +[default7]: iteration 5050/ 6200 | consumed samples: 5171200 | consumed tokens: 10590617600 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679657E+00 | loss scale: 2048.0 | grad norm: 5.853 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.581 | TFLOPs: 42.24 | +[default7]: iteration 5051/ 6200 | consumed samples: 5172224 | consumed tokens: 10592714752 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645264E+00 | loss scale: 2048.0 | grad norm: 4.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.645 | TFLOPs: 42.26 | +[default7]: iteration 5052/ 6200 | consumed samples: 5173248 | consumed tokens: 10594811904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646860E+00 | loss scale: 2048.0 | grad norm: 4.827 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.662 | TFLOPs: 42.27 | +[default7]: iteration 5053/ 6200 | consumed samples: 5174272 | consumed tokens: 10596909056 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628026E+00 | loss scale: 2048.0 | grad norm: 5.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.762 | TFLOPs: 42.30 | +[default7]: iteration 5054/ 6200 | consumed samples: 5175296 | consumed tokens: 10599006208 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656231E+00 | loss scale: 2048.0 | grad norm: 6.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.868 | TFLOPs: 42.33 | +[default7]: iteration 5055/ 6200 | consumed samples: 5176320 | consumed tokens: 10601103360 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641662E+00 | loss scale: 2048.0 | grad norm: 6.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.714 | TFLOPs: 42.28 | +[default7]: iteration 5056/ 6200 | consumed samples: 5177344 | consumed tokens: 10603200512 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659910E+00 | loss scale: 2048.0 | grad norm: 5.679 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.454 | TFLOPs: 42.20 | +[default7]: iteration 5057/ 6200 | consumed samples: 5178368 | consumed tokens: 10605297664 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656070E+00 | loss scale: 2048.0 | grad norm: 5.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.448 | TFLOPs: 42.20 | +[default7]: iteration 5058/ 6200 | consumed samples: 5179392 | consumed tokens: 10607394816 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661574E+00 | loss scale: 2048.0 | grad norm: 5.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.600 | TFLOPs: 42.25 | +[default7]: iteration 5059/ 6200 | consumed samples: 5180416 | consumed tokens: 10609491968 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660037E+00 | loss scale: 2048.0 | grad norm: 5.720 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.861 | TFLOPs: 42.33 | +[default7]: iteration 5060/ 6200 | consumed samples: 5181440 | consumed tokens: 10611589120 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650131E+00 | loss scale: 2048.0 | grad norm: 6.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.715 | TFLOPs: 42.28 | +[default7]: iteration 5061/ 6200 | consumed samples: 5182464 | consumed tokens: 10613686272 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652954E+00 | loss scale: 2048.0 | grad norm: 4.920 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.722 | TFLOPs: 42.28 | +[default7]: iteration 5062/ 6200 | consumed samples: 5183488 | consumed tokens: 10615783424 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663415E+00 | loss scale: 2048.0 | grad norm: 5.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.349 | TFLOPs: 42.17 | +[default7]: iteration 5063/ 6200 | consumed samples: 5184512 | consumed tokens: 10617880576 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.655828E+00 | loss scale: 2048.0 | grad norm: 5.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.983 | TFLOPs: 42.36 | +[default7]: iteration 5064/ 6200 | consumed samples: 5185536 | consumed tokens: 10619977728 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656385E+00 | loss scale: 2048.0 | grad norm: 4.935 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.957 | TFLOPs: 42.36 | +[default7]: iteration 5065/ 6200 | consumed samples: 5186560 | consumed tokens: 10622074880 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631291E+00 | loss scale: 2048.0 | grad norm: 4.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.852 | TFLOPs: 42.32 | +[default7]: iteration 5066/ 6200 | consumed samples: 5187584 | consumed tokens: 10624172032 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643125E+00 | loss scale: 2048.0 | grad norm: 5.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.966 | TFLOPs: 42.36 | +[default7]: iteration 5067/ 6200 | consumed samples: 5188608 | consumed tokens: 10626269184 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.664042E+00 | loss scale: 2048.0 | grad norm: 4.752 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.689 | TFLOPs: 42.27 | +[default7]: iteration 5068/ 6200 | consumed samples: 5189632 | consumed tokens: 10628366336 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645439E+00 | loss scale: 2048.0 | grad norm: 5.811 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.937 | TFLOPs: 42.35 | +[default7]: iteration 5069/ 6200 | consumed samples: 5190656 | consumed tokens: 10630463488 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662261E+00 | loss scale: 2048.0 | grad norm: 4.671 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.941 | TFLOPs: 42.35 | +[default7]: iteration 5070/ 6200 | consumed samples: 5191680 | consumed tokens: 10632560640 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639251E+00 | loss scale: 2048.0 | grad norm: 5.919 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.009 | TFLOPs: 42.37 | +[default7]: iteration 5071/ 6200 | consumed samples: 5192704 | consumed tokens: 10634657792 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671767E+00 | loss scale: 2048.0 | grad norm: 5.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.873 | TFLOPs: 42.33 | +[default7]: iteration 5072/ 6200 | consumed samples: 5193728 | consumed tokens: 10636754944 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654334E+00 | loss scale: 2048.0 | grad norm: 6.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.075 | TFLOPs: 42.39 | +[default7]: iteration 5073/ 6200 | consumed samples: 5194752 | consumed tokens: 10638852096 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634962E+00 | loss scale: 2048.0 | grad norm: 5.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.875 | TFLOPs: 42.33 | +[default7]: iteration 5074/ 6200 | consumed samples: 5195776 | consumed tokens: 10640949248 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.690253E+00 | loss scale: 2048.0 | grad norm: 5.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.879 | TFLOPs: 42.33 | +[default7]: iteration 5075/ 6200 | consumed samples: 5196800 | consumed tokens: 10643046400 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649294E+00 | loss scale: 2048.0 | grad norm: 6.941 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.911 | TFLOPs: 42.34 | +[default7]: iteration 5076/ 6200 | consumed samples: 5197824 | consumed tokens: 10645143552 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665958E+00 | loss scale: 2048.0 | grad norm: 5.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.996 | TFLOPs: 42.37 | +[default7]: iteration 5077/ 6200 | consumed samples: 5198848 | consumed tokens: 10647240704 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653651E+00 | loss scale: 2048.0 | grad norm: 6.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.082 | TFLOPs: 42.39 | +[default7]: iteration 5078/ 6200 | consumed samples: 5199872 | consumed tokens: 10649337856 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660641E+00 | loss scale: 2048.0 | grad norm: 5.854 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.996 | TFLOPs: 42.37 | +[default7]: iteration 5079/ 6200 | consumed samples: 5200896 | consumed tokens: 10651435008 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669846E+00 | loss scale: 2048.0 | grad norm: 5.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.696 | TFLOPs: 42.28 | +[default7]: iteration 5080/ 6200 | consumed samples: 5201920 | consumed tokens: 10653532160 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673025E+00 | loss scale: 2048.0 | grad norm: 5.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.337 | TFLOPs: 42.47 | +[default7]: iteration 5081/ 6200 | consumed samples: 5202944 | consumed tokens: 10655629312 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630786E+00 | loss scale: 2048.0 | grad norm: 5.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.417 | TFLOPs: 42.50 | +[default7]: iteration 5082/ 6200 | consumed samples: 5203968 | consumed tokens: 10657726464 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644732E+00 | loss scale: 2048.0 | grad norm: 5.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.623 | TFLOPs: 42.25 | +[default7]: iteration 5083/ 6200 | consumed samples: 5204992 | consumed tokens: 10659823616 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653836E+00 | loss scale: 2048.0 | grad norm: 6.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.266 | TFLOPs: 42.14 | +[default7]: iteration 5084/ 6200 | consumed samples: 5206016 | consumed tokens: 10661920768 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632764E+00 | loss scale: 2048.0 | grad norm: 5.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.439 | TFLOPs: 42.50 | +[default7]: iteration 5085/ 6200 | consumed samples: 5207040 | consumed tokens: 10664017920 | elapsed time per iteration (s): 7.24 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657307E+00 | loss scale: 1024.0 | grad norm: 5.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 141.494 | TFLOPs: 43.13 | +[default0]:[2022-10-06 23:39:38,822] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 +[default7]: iteration 5086/ 6200 | consumed samples: 5208064 | consumed tokens: 10666115072 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638550E+00 | loss scale: 1024.0 | grad norm: 5.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.346 | TFLOPs: 42.47 | +[default7]: iteration 5087/ 6200 | consumed samples: 5209088 | consumed tokens: 10668212224 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651228E+00 | loss scale: 1024.0 | grad norm: 5.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.863 | TFLOPs: 42.33 | +[default7]: iteration 5088/ 6200 | consumed samples: 5210112 | consumed tokens: 10670309376 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636121E+00 | loss scale: 1024.0 | grad norm: 5.847 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.680 | TFLOPs: 42.27 | +[default7]: iteration 5089/ 6200 | consumed samples: 5211136 | consumed tokens: 10672406528 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658419E+00 | loss scale: 1024.0 | grad norm: 5.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.759 | TFLOPs: 42.29 | +[default7]: iteration 5090/ 6200 | consumed samples: 5212160 | consumed tokens: 10674503680 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668829E+00 | loss scale: 1024.0 | grad norm: 5.072 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.713 | TFLOPs: 42.28 | +[default7]: iteration 5091/ 6200 | consumed samples: 5213184 | consumed tokens: 10676600832 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661003E+00 | loss scale: 1024.0 | grad norm: 4.948 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.637 | TFLOPs: 42.26 | +[default7]: iteration 5092/ 6200 | consumed samples: 5214208 | consumed tokens: 10678697984 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635052E+00 | loss scale: 1024.0 | grad norm: 4.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.901 | TFLOPs: 42.34 | +[default7]: iteration 5093/ 6200 | consumed samples: 5215232 | consumed tokens: 10680795136 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.675884E+00 | loss scale: 1024.0 | grad norm: 5.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.883 | TFLOPs: 42.33 | +[default7]: iteration 5094/ 6200 | consumed samples: 5216256 | consumed tokens: 10682892288 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642985E+00 | loss scale: 1024.0 | grad norm: 5.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.078 | TFLOPs: 42.39 | +[default7]: iteration 5095/ 6200 | consumed samples: 5217280 | consumed tokens: 10684989440 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663897E+00 | loss scale: 1024.0 | grad norm: 4.829 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.641 | TFLOPs: 42.26 | +[default7]: iteration 5096/ 6200 | consumed samples: 5218304 | consumed tokens: 10687086592 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633709E+00 | loss scale: 1024.0 | grad norm: 5.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.462 | TFLOPs: 42.20 | +[default7]: iteration 5097/ 6200 | consumed samples: 5219328 | consumed tokens: 10689183744 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631251E+00 | loss scale: 1024.0 | grad norm: 5.663 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.742 | TFLOPs: 42.29 | +[default7]: iteration 5098/ 6200 | consumed samples: 5220352 | consumed tokens: 10691280896 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658207E+00 | loss scale: 1024.0 | grad norm: 5.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.579 | TFLOPs: 42.24 | +[default7]: iteration 5099/ 6200 | consumed samples: 5221376 | consumed tokens: 10693378048 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647868E+00 | loss scale: 1024.0 | grad norm: 4.993 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.401 | TFLOPs: 42.19 | +[default7]: iteration 5100/ 6200 | consumed samples: 5222400 | consumed tokens: 10695475200 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647106E+00 | loss scale: 1024.0 | grad norm: 5.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.745 | TFLOPs: 42.29 | +[default7]: iteration 5101/ 6200 | consumed samples: 5223424 | consumed tokens: 10697572352 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.655500E+00 | loss scale: 1024.0 | grad norm: 4.820 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.365 | TFLOPs: 42.48 | +[default7]: iteration 5102/ 6200 | consumed samples: 5224448 | consumed tokens: 10699669504 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663818E+00 | loss scale: 1024.0 | grad norm: 5.843 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.391 | TFLOPs: 42.49 | +[default7]: iteration 5103/ 6200 | consumed samples: 5225472 | consumed tokens: 10701766656 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641748E+00 | loss scale: 1024.0 | grad norm: 6.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.419 | TFLOPs: 42.50 | +[default7]: iteration 5104/ 6200 | consumed samples: 5226496 | consumed tokens: 10703863808 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646941E+00 | loss scale: 1024.0 | grad norm: 4.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.195 | TFLOPs: 42.12 | +[default7]: iteration 5105/ 6200 | consumed samples: 5227520 | consumed tokens: 10705960960 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653820E+00 | loss scale: 1024.0 | grad norm: 4.606 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.130 | TFLOPs: 42.41 | +[default7]: iteration 5106/ 6200 | consumed samples: 5228544 | consumed tokens: 10708058112 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687210E+00 | loss scale: 1024.0 | grad norm: 6.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.168 | TFLOPs: 42.42 | +[default7]: iteration 5107/ 6200 | consumed samples: 5229568 | consumed tokens: 10710155264 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633394E+00 | loss scale: 1024.0 | grad norm: 5.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.245 | TFLOPs: 42.44 | +[default7]: iteration 5108/ 6200 | consumed samples: 5230592 | consumed tokens: 10712252416 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651970E+00 | loss scale: 1024.0 | grad norm: 5.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.897 | TFLOPs: 42.34 | +[default7]: iteration 5109/ 6200 | consumed samples: 5231616 | consumed tokens: 10714349568 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636062E+00 | loss scale: 1024.0 | grad norm: 5.065 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.845 | TFLOPs: 42.32 | +[default7]: iteration 5110/ 6200 | consumed samples: 5232640 | consumed tokens: 10716446720 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644599E+00 | loss scale: 1024.0 | grad norm: 6.775 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.644 | TFLOPs: 42.26 | +[default7]: iteration 5111/ 6200 | consumed samples: 5233664 | consumed tokens: 10718543872 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.682484E+00 | loss scale: 1024.0 | grad norm: 5.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.945 | TFLOPs: 42.35 | +[default7]: iteration 5112/ 6200 | consumed samples: 5234688 | consumed tokens: 10720641024 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663677E+00 | loss scale: 1024.0 | grad norm: 5.528 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.818 | TFLOPs: 42.31 | +[default7]: iteration 5113/ 6200 | consumed samples: 5235712 | consumed tokens: 10722738176 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646886E+00 | loss scale: 1024.0 | grad norm: 5.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.758 | TFLOPs: 42.29 | +[default7]: iteration 5114/ 6200 | consumed samples: 5236736 | consumed tokens: 10724835328 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637564E+00 | loss scale: 1024.0 | grad norm: 5.960 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.995 | TFLOPs: 42.37 | +[default7]: iteration 5115/ 6200 | consumed samples: 5237760 | consumed tokens: 10726932480 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637822E+00 | loss scale: 1024.0 | grad norm: 5.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.109 | TFLOPs: 42.40 | +[default7]: iteration 5116/ 6200 | consumed samples: 5238784 | consumed tokens: 10729029632 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638731E+00 | loss scale: 1024.0 | grad norm: 5.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.816 | TFLOPs: 42.31 | +[default7]: iteration 5117/ 6200 | consumed samples: 5239808 | consumed tokens: 10731126784 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619422E+00 | loss scale: 1024.0 | grad norm: 5.071 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.697 | TFLOPs: 42.28 | +[default7]: iteration 5118/ 6200 | consumed samples: 5240832 | consumed tokens: 10733223936 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639971E+00 | loss scale: 1024.0 | grad norm: 5.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.812 | TFLOPs: 42.31 | +[default7]: iteration 5119/ 6200 | consumed samples: 5241856 | consumed tokens: 10735321088 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654355E+00 | loss scale: 1024.0 | grad norm: 5.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.185 | TFLOPs: 42.42 | +[default7]: iteration 5120/ 6200 | consumed samples: 5242880 | consumed tokens: 10737418240 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648114E+00 | loss scale: 1024.0 | grad norm: 5.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.822 | TFLOPs: 42.31 | +[default7]: iteration 5121/ 6200 | consumed samples: 5243904 | consumed tokens: 10739515392 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642689E+00 | loss scale: 1024.0 | grad norm: 6.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.205 | TFLOPs: 42.43 | +[default7]: iteration 5122/ 6200 | consumed samples: 5244928 | consumed tokens: 10741612544 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659555E+00 | loss scale: 1024.0 | grad norm: 7.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.326 | TFLOPs: 42.47 | +[default7]: iteration 5123/ 6200 | consumed samples: 5245952 | consumed tokens: 10743709696 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671024E+00 | loss scale: 1024.0 | grad norm: 5.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.438 | TFLOPs: 42.50 | +[default7]: iteration 5124/ 6200 | consumed samples: 5246976 | consumed tokens: 10745806848 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.693233E+00 | loss scale: 1024.0 | grad norm: 5.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.553 | TFLOPs: 42.54 | +[default7]: iteration 5125/ 6200 | consumed samples: 5248000 | consumed tokens: 10747904000 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.677115E+00 | loss scale: 1024.0 | grad norm: 5.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.223 | TFLOPs: 42.44 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 5125 | lm loss value: 3.681888E+00 | lm loss PPL: 3.972133E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 5125 | lm loss value: 1.544245E+00 | lm loss PPL: 4.684435E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 5126/ 6200 | consumed samples: 5249024 | consumed tokens: 10750001152 | elapsed time per iteration (s): 51.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630834E+00 | loss scale: 1024.0 | grad norm: 5.735 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.932 | TFLOPs: 6.08 | +[default7]: iteration 5127/ 6200 | consumed samples: 5250048 | consumed tokens: 10752098304 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654926E+00 | loss scale: 1024.0 | grad norm: 5.062 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.270 | TFLOPs: 42.45 | +[default7]: iteration 5128/ 6200 | consumed samples: 5251072 | consumed tokens: 10754195456 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652653E+00 | loss scale: 1024.0 | grad norm: 5.775 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.134 | TFLOPs: 42.10 | +[default7]: iteration 5129/ 6200 | consumed samples: 5252096 | consumed tokens: 10756292608 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656572E+00 | loss scale: 1024.0 | grad norm: 4.603 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.143 | TFLOPs: 42.41 | +[default7]: iteration 5130/ 6200 | consumed samples: 5253120 | consumed tokens: 10758389760 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643521E+00 | loss scale: 1024.0 | grad norm: 4.864 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.333 | TFLOPs: 42.47 | +[default7]: iteration 5131/ 6200 | consumed samples: 5254144 | consumed tokens: 10760486912 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665297E+00 | loss scale: 1024.0 | grad norm: 5.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.374 | TFLOPs: 42.48 | +[default7]: iteration 5132/ 6200 | consumed samples: 5255168 | consumed tokens: 10762584064 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626985E+00 | loss scale: 1024.0 | grad norm: 6.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.492 | TFLOPs: 42.21 | +[default7]: iteration 5133/ 6200 | consumed samples: 5256192 | consumed tokens: 10764681216 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.677858E+00 | loss scale: 1024.0 | grad norm: 4.906 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.819 | TFLOPs: 42.31 | +[default7]: iteration 5134/ 6200 | consumed samples: 5257216 | consumed tokens: 10766778368 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622086E+00 | loss scale: 1024.0 | grad norm: 4.998 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.214 | TFLOPs: 42.43 | +[default7]: iteration 5135/ 6200 | consumed samples: 5258240 | consumed tokens: 10768875520 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636114E+00 | loss scale: 1024.0 | grad norm: 4.788 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.790 | TFLOPs: 42.30 | +[default7]: iteration 5136/ 6200 | consumed samples: 5259264 | consumed tokens: 10770972672 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661904E+00 | loss scale: 1024.0 | grad norm: 6.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.716 | TFLOPs: 42.28 | +[default7]: iteration 5137/ 6200 | consumed samples: 5260288 | consumed tokens: 10773069824 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637485E+00 | loss scale: 1024.0 | grad norm: 5.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.937 | TFLOPs: 42.35 | +[default7]: iteration 5138/ 6200 | consumed samples: 5261312 | consumed tokens: 10775166976 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680716E+00 | loss scale: 1024.0 | grad norm: 5.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.957 | TFLOPs: 42.36 | +[default7]: iteration 5139/ 6200 | consumed samples: 5262336 | consumed tokens: 10777264128 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650687E+00 | loss scale: 1024.0 | grad norm: 6.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.651 | TFLOPs: 42.26 | +[default7]: iteration 5140/ 6200 | consumed samples: 5263360 | consumed tokens: 10779361280 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661216E+00 | loss scale: 1024.0 | grad norm: 7.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.845 | TFLOPs: 42.32 | +[default7]: iteration 5141/ 6200 | consumed samples: 5264384 | consumed tokens: 10781458432 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652768E+00 | loss scale: 1024.0 | grad norm: 5.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.787 | TFLOPs: 42.30 | +[default7]: iteration 5142/ 6200 | consumed samples: 5265408 | consumed tokens: 10783555584 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666217E+00 | loss scale: 1024.0 | grad norm: 4.820 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.957 | TFLOPs: 42.36 | +[default7]: iteration 5143/ 6200 | consumed samples: 5266432 | consumed tokens: 10785652736 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.620571E+00 | loss scale: 1024.0 | grad norm: 6.622 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.602 | TFLOPs: 42.25 | +[default7]: iteration 5144/ 6200 | consumed samples: 5267456 | consumed tokens: 10787749888 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670356E+00 | loss scale: 1024.0 | grad norm: 5.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.826 | TFLOPs: 42.32 | +[default7]: iteration 5145/ 6200 | consumed samples: 5268480 | consumed tokens: 10789847040 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617732E+00 | loss scale: 1024.0 | grad norm: 5.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.379 | TFLOPs: 42.48 | +[default7]: iteration 5146/ 6200 | consumed samples: 5269504 | consumed tokens: 10791944192 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666132E+00 | loss scale: 1024.0 | grad norm: 5.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.369 | TFLOPs: 42.48 | +[default7]: iteration 5147/ 6200 | consumed samples: 5270528 | consumed tokens: 10794041344 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653627E+00 | loss scale: 1024.0 | grad norm: 5.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.082 | TFLOPs: 42.39 | +[default7]: iteration 5148/ 6200 | consumed samples: 5271552 | consumed tokens: 10796138496 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671299E+00 | loss scale: 1024.0 | grad norm: 5.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.597 | TFLOPs: 42.25 | +[default7]: iteration 5149/ 6200 | consumed samples: 5272576 | consumed tokens: 10798235648 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651143E+00 | loss scale: 1024.0 | grad norm: 5.632 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.802 | TFLOPs: 42.31 | +[default7]: iteration 5150/ 6200 | consumed samples: 5273600 | consumed tokens: 10800332800 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619358E+00 | loss scale: 1024.0 | grad norm: 4.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.464 | TFLOPs: 42.20 | +[default7]: iteration 5151/ 6200 | consumed samples: 5274624 | consumed tokens: 10802429952 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640765E+00 | loss scale: 1024.0 | grad norm: 5.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.339 | TFLOPs: 42.17 | +[default7]: iteration 5152/ 6200 | consumed samples: 5275648 | consumed tokens: 10804527104 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652484E+00 | loss scale: 1024.0 | grad norm: 6.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.203 | TFLOPs: 42.43 | +[default7]: iteration 5153/ 6200 | consumed samples: 5276672 | consumed tokens: 10806624256 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.606425E+00 | loss scale: 1024.0 | grad norm: 5.033 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.325 | TFLOPs: 42.47 | +[default7]: iteration 5154/ 6200 | consumed samples: 5277696 | consumed tokens: 10808721408 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.655967E+00 | loss scale: 1024.0 | grad norm: 5.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.257 | TFLOPs: 42.14 | +[default7]: iteration 5155/ 6200 | consumed samples: 5278720 | consumed tokens: 10810818560 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635406E+00 | loss scale: 1024.0 | grad norm: 7.723 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.732 | TFLOPs: 42.29 | +[default7]: iteration 5156/ 6200 | consumed samples: 5279744 | consumed tokens: 10812915712 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661981E+00 | loss scale: 1024.0 | grad norm: 5.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.285 | TFLOPs: 42.15 | +[default7]: iteration 5157/ 6200 | consumed samples: 5280768 | consumed tokens: 10815012864 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645224E+00 | loss scale: 1024.0 | grad norm: 5.655 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.261 | TFLOPs: 42.45 | +[default7]: iteration 5158/ 6200 | consumed samples: 5281792 | consumed tokens: 10817110016 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647600E+00 | loss scale: 1024.0 | grad norm: 5.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.259 | TFLOPs: 42.45 | +[default7]: iteration 5159/ 6200 | consumed samples: 5282816 | consumed tokens: 10819207168 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.674168E+00 | loss scale: 1024.0 | grad norm: 5.653 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.273 | TFLOPs: 42.45 | +[default7]: iteration 5160/ 6200 | consumed samples: 5283840 | consumed tokens: 10821304320 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631416E+00 | loss scale: 1024.0 | grad norm: 4.820 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.695 | TFLOPs: 42.28 | +[default7]: iteration 5161/ 6200 | consumed samples: 5284864 | consumed tokens: 10823401472 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659175E+00 | loss scale: 1024.0 | grad norm: 6.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.151 | TFLOPs: 42.41 | +[default7]: iteration 5162/ 6200 | consumed samples: 5285888 | consumed tokens: 10825498624 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624324E+00 | loss scale: 1024.0 | grad norm: 4.984 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.109 | TFLOPs: 42.40 | +[default7]: iteration 5163/ 6200 | consumed samples: 5286912 | consumed tokens: 10827595776 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636400E+00 | loss scale: 1024.0 | grad norm: 5.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.201 | TFLOPs: 42.43 | +[default7]: iteration 5164/ 6200 | consumed samples: 5287936 | consumed tokens: 10829692928 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640281E+00 | loss scale: 1024.0 | grad norm: 5.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.951 | TFLOPs: 42.35 | +[default7]: iteration 5165/ 6200 | consumed samples: 5288960 | consumed tokens: 10831790080 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621634E+00 | loss scale: 1024.0 | grad norm: 5.765 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.127 | TFLOPs: 42.41 | +[default7]: iteration 5166/ 6200 | consumed samples: 5289984 | consumed tokens: 10833887232 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623006E+00 | loss scale: 1024.0 | grad norm: 5.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.219 | TFLOPs: 42.44 | +[default7]: iteration 5167/ 6200 | consumed samples: 5291008 | consumed tokens: 10835984384 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653875E+00 | loss scale: 1024.0 | grad norm: 5.928 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.272 | TFLOPs: 42.45 | +[default7]: iteration 5168/ 6200 | consumed samples: 5292032 | consumed tokens: 10838081536 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628190E+00 | loss scale: 1024.0 | grad norm: 5.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.410 | TFLOPs: 42.19 | +[default7]: iteration 5169/ 6200 | consumed samples: 5293056 | consumed tokens: 10840178688 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650778E+00 | loss scale: 1024.0 | grad norm: 6.013 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.050 | TFLOPs: 42.38 | +[default7]: iteration 5170/ 6200 | consumed samples: 5294080 | consumed tokens: 10842275840 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654338E+00 | loss scale: 1024.0 | grad norm: 5.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.118 | TFLOPs: 42.40 | +[default7]: iteration 5171/ 6200 | consumed samples: 5295104 | consumed tokens: 10844372992 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635146E+00 | loss scale: 1024.0 | grad norm: 5.728 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.198 | TFLOPs: 42.43 | +[default7]: iteration 5172/ 6200 | consumed samples: 5296128 | consumed tokens: 10846470144 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.686059E+00 | loss scale: 1024.0 | grad norm: 6.039 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.412 | TFLOPs: 42.49 | +[default7]: iteration 5173/ 6200 | consumed samples: 5297152 | consumed tokens: 10848567296 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641774E+00 | loss scale: 1024.0 | grad norm: 5.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.372 | TFLOPs: 42.48 | +[default7]: iteration 5174/ 6200 | consumed samples: 5298176 | consumed tokens: 10850664448 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646158E+00 | loss scale: 1024.0 | grad norm: 4.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.110 | TFLOPs: 42.40 | +[default7]: iteration 5175/ 6200 | consumed samples: 5299200 | consumed tokens: 10852761600 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614946E+00 | loss scale: 1024.0 | grad norm: 5.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.343 | TFLOPs: 42.47 | +[default7]: iteration 5176/ 6200 | consumed samples: 5300224 | consumed tokens: 10854858752 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669881E+00 | loss scale: 1024.0 | grad norm: 6.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.209 | TFLOPs: 42.43 | +[default7]: iteration 5177/ 6200 | consumed samples: 5301248 | consumed tokens: 10856955904 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632664E+00 | loss scale: 1024.0 | grad norm: 4.816 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.023 | TFLOPs: 42.38 | +[default7]: iteration 5178/ 6200 | consumed samples: 5302272 | consumed tokens: 10859053056 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644812E+00 | loss scale: 1024.0 | grad norm: 5.070 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.863 | TFLOPs: 42.33 | +[default7]: iteration 5179/ 6200 | consumed samples: 5303296 | consumed tokens: 10861150208 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626472E+00 | loss scale: 1024.0 | grad norm: 5.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.551 | TFLOPs: 42.23 | +[default7]: iteration 5180/ 6200 | consumed samples: 5304320 | consumed tokens: 10863247360 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658169E+00 | loss scale: 1024.0 | grad norm: 5.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.498 | TFLOPs: 42.22 | +[default7]: iteration 5181/ 6200 | consumed samples: 5305344 | consumed tokens: 10865344512 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656195E+00 | loss scale: 1024.0 | grad norm: 4.906 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.777 | TFLOPs: 42.30 | +[default7]: iteration 5182/ 6200 | consumed samples: 5306368 | consumed tokens: 10867441664 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.684023E+00 | loss scale: 1024.0 | grad norm: 6.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.715 | TFLOPs: 42.28 | +[default7]: iteration 5183/ 6200 | consumed samples: 5307392 | consumed tokens: 10869538816 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642813E+00 | loss scale: 1024.0 | grad norm: 5.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.719 | TFLOPs: 42.28 | +[default7]: iteration 5184/ 6200 | consumed samples: 5308416 | consumed tokens: 10871635968 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631289E+00 | loss scale: 1024.0 | grad norm: 5.786 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.495 | TFLOPs: 42.21 | +[default7]: iteration 5185/ 6200 | consumed samples: 5309440 | consumed tokens: 10873733120 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650715E+00 | loss scale: 1024.0 | grad norm: 4.918 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.521 | TFLOPs: 42.22 | +[default7]: iteration 5186/ 6200 | consumed samples: 5310464 | consumed tokens: 10875830272 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659163E+00 | loss scale: 1024.0 | grad norm: 5.698 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.744 | TFLOPs: 42.29 | +[default7]: iteration 5187/ 6200 | consumed samples: 5311488 | consumed tokens: 10877927424 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673228E+00 | loss scale: 1024.0 | grad norm: 6.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.598 | TFLOPs: 42.25 | +[default7]: iteration 5188/ 6200 | consumed samples: 5312512 | consumed tokens: 10880024576 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660745E+00 | loss scale: 1024.0 | grad norm: 5.049 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.515 | TFLOPs: 42.22 | +[default7]: iteration 5189/ 6200 | consumed samples: 5313536 | consumed tokens: 10882121728 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628016E+00 | loss scale: 1024.0 | grad norm: 6.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.695 | TFLOPs: 42.28 | +[default7]: iteration 5190/ 6200 | consumed samples: 5314560 | consumed tokens: 10884218880 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654924E+00 | loss scale: 1024.0 | grad norm: 4.969 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.674 | TFLOPs: 42.27 | +[default7]: iteration 5191/ 6200 | consumed samples: 5315584 | consumed tokens: 10886316032 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662863E+00 | loss scale: 1024.0 | grad norm: 6.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.639 | TFLOPs: 42.26 | +[default7]: iteration 5192/ 6200 | consumed samples: 5316608 | consumed tokens: 10888413184 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645747E+00 | loss scale: 1024.0 | grad norm: 5.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.679 | TFLOPs: 42.27 | +[default7]: iteration 5193/ 6200 | consumed samples: 5317632 | consumed tokens: 10890510336 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631596E+00 | loss scale: 1024.0 | grad norm: 5.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.577 | TFLOPs: 42.24 | +[default7]: iteration 5194/ 6200 | consumed samples: 5318656 | consumed tokens: 10892607488 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638822E+00 | loss scale: 1024.0 | grad norm: 5.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.865 | TFLOPs: 42.33 | +[default7]: iteration 5195/ 6200 | consumed samples: 5319680 | consumed tokens: 10894704640 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624432E+00 | loss scale: 1024.0 | grad norm: 5.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.950 | TFLOPs: 42.35 | +[default7]: iteration 5196/ 6200 | consumed samples: 5320704 | consumed tokens: 10896801792 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635506E+00 | loss scale: 1024.0 | grad norm: 5.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.721 | TFLOPs: 42.28 | +[default7]: iteration 5197/ 6200 | consumed samples: 5321728 | consumed tokens: 10898898944 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652173E+00 | loss scale: 1024.0 | grad norm: 6.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.827 | TFLOPs: 42.32 | +[default7]: iteration 5198/ 6200 | consumed samples: 5322752 | consumed tokens: 10900996096 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636647E+00 | loss scale: 1024.0 | grad norm: 5.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.687 | TFLOPs: 42.27 | +[default7]: iteration 5199/ 6200 | consumed samples: 5323776 | consumed tokens: 10903093248 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669660E+00 | loss scale: 1024.0 | grad norm: 7.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.458 | TFLOPs: 42.20 | +[default7]: iteration 5200/ 6200 | consumed samples: 5324800 | consumed tokens: 10905190400 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648269E+00 | loss scale: 1024.0 | grad norm: 5.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.603 | TFLOPs: 41.94 | +[default7]: iteration 5201/ 6200 | consumed samples: 5325824 | consumed tokens: 10907287552 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635034E+00 | loss scale: 1024.0 | grad norm: 5.729 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.456 | TFLOPs: 42.20 | +[default7]: iteration 5202/ 6200 | consumed samples: 5326848 | consumed tokens: 10909384704 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652673E+00 | loss scale: 1024.0 | grad norm: 5.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.361 | TFLOPs: 42.17 | +[default7]: iteration 5203/ 6200 | consumed samples: 5327872 | consumed tokens: 10911481856 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653290E+00 | loss scale: 1024.0 | grad norm: 6.108 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.224 | TFLOPs: 42.13 | +[default7]: iteration 5204/ 6200 | consumed samples: 5328896 | consumed tokens: 10913579008 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643826E+00 | loss scale: 1024.0 | grad norm: 5.882 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.735 | TFLOPs: 41.98 | +[default7]: iteration 5205/ 6200 | consumed samples: 5329920 | consumed tokens: 10915676160 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.607067E+00 | loss scale: 1024.0 | grad norm: 6.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.451 | TFLOPs: 42.20 | +[default7]: iteration 5206/ 6200 | consumed samples: 5330944 | consumed tokens: 10917773312 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641818E+00 | loss scale: 1024.0 | grad norm: 6.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.466 | TFLOPs: 42.21 | +[default7]: iteration 5207/ 6200 | consumed samples: 5331968 | consumed tokens: 10919870464 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647367E+00 | loss scale: 1024.0 | grad norm: 5.768 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.586 | TFLOPs: 42.24 | +[default7]: iteration 5208/ 6200 | consumed samples: 5332992 | consumed tokens: 10921967616 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658635E+00 | loss scale: 1024.0 | grad norm: 4.987 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.739 | TFLOPs: 42.29 | +[default7]: iteration 5209/ 6200 | consumed samples: 5334016 | consumed tokens: 10924064768 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656892E+00 | loss scale: 1024.0 | grad norm: 5.771 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.471 | TFLOPs: 42.21 | +[default7]: iteration 5210/ 6200 | consumed samples: 5335040 | consumed tokens: 10926161920 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647557E+00 | loss scale: 1024.0 | grad norm: 6.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.610 | TFLOPs: 42.25 | +[default7]: iteration 5211/ 6200 | consumed samples: 5336064 | consumed tokens: 10928259072 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643874E+00 | loss scale: 1024.0 | grad norm: 5.522 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.767 | TFLOPs: 42.30 | +[default7]: iteration 5212/ 6200 | consumed samples: 5337088 | consumed tokens: 10930356224 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641028E+00 | loss scale: 1024.0 | grad norm: 5.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.726 | TFLOPs: 42.28 | +[default7]: iteration 5213/ 6200 | consumed samples: 5338112 | consumed tokens: 10932453376 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629973E+00 | loss scale: 1024.0 | grad norm: 4.998 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.529 | TFLOPs: 42.22 | +[default7]: iteration 5214/ 6200 | consumed samples: 5339136 | consumed tokens: 10934550528 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640045E+00 | loss scale: 1024.0 | grad norm: 6.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.651 | TFLOPs: 42.26 | +[default7]: iteration 5215/ 6200 | consumed samples: 5340160 | consumed tokens: 10936647680 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648832E+00 | loss scale: 1024.0 | grad norm: 5.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.389 | TFLOPs: 42.49 | +[default7]: iteration 5216/ 6200 | consumed samples: 5341184 | consumed tokens: 10938744832 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661785E+00 | loss scale: 1024.0 | grad norm: 6.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.300 | TFLOPs: 42.46 | +[default7]: iteration 5217/ 6200 | consumed samples: 5342208 | consumed tokens: 10940841984 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635525E+00 | loss scale: 1024.0 | grad norm: 5.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.302 | TFLOPs: 42.46 | +[default7]: iteration 5218/ 6200 | consumed samples: 5343232 | consumed tokens: 10942939136 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642937E+00 | loss scale: 1024.0 | grad norm: 5.963 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.139 | TFLOPs: 42.41 | +[default7]: iteration 5219/ 6200 | consumed samples: 5344256 | consumed tokens: 10945036288 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652326E+00 | loss scale: 1024.0 | grad norm: 6.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.905 | TFLOPs: 42.34 | +[default7]: iteration 5220/ 6200 | consumed samples: 5345280 | consumed tokens: 10947133440 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656159E+00 | loss scale: 1024.0 | grad norm: 5.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.723 | TFLOPs: 42.28 | +[default7]: iteration 5221/ 6200 | consumed samples: 5346304 | consumed tokens: 10949230592 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672031E+00 | loss scale: 1024.0 | grad norm: 4.753 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.129 | TFLOPs: 42.10 | +[default7]: iteration 5222/ 6200 | consumed samples: 5347328 | consumed tokens: 10951327744 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637750E+00 | loss scale: 1024.0 | grad norm: 4.715 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.797 | TFLOPs: 42.31 | +[default7]: iteration 5223/ 6200 | consumed samples: 5348352 | consumed tokens: 10953424896 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623729E+00 | loss scale: 1024.0 | grad norm: 5.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.962 | TFLOPs: 42.36 | +[default7]: iteration 5224/ 6200 | consumed samples: 5349376 | consumed tokens: 10955522048 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643266E+00 | loss scale: 1024.0 | grad norm: 4.954 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.967 | TFLOPs: 42.36 | +[default7]: iteration 5225/ 6200 | consumed samples: 5350400 | consumed tokens: 10957619200 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667439E+00 | loss scale: 1024.0 | grad norm: 5.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.288 | TFLOPs: 42.15 | +[default7]: iteration 5226/ 6200 | consumed samples: 5351424 | consumed tokens: 10959716352 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649690E+00 | loss scale: 1024.0 | grad norm: 5.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.937 | TFLOPs: 42.35 | +[default7]: iteration 5227/ 6200 | consumed samples: 5352448 | consumed tokens: 10961813504 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634903E+00 | loss scale: 1024.0 | grad norm: 4.635 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.862 | TFLOPs: 42.33 | +[default7]: iteration 5228/ 6200 | consumed samples: 5353472 | consumed tokens: 10963910656 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631069E+00 | loss scale: 1024.0 | grad norm: 5.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.797 | TFLOPs: 42.31 | +[default7]: iteration 5229/ 6200 | consumed samples: 5354496 | consumed tokens: 10966007808 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666375E+00 | loss scale: 1024.0 | grad norm: 4.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.827 | TFLOPs: 42.01 | +[default7]: iteration 5230/ 6200 | consumed samples: 5355520 | consumed tokens: 10968104960 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652024E+00 | loss scale: 1024.0 | grad norm: 4.965 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.091 | TFLOPs: 42.40 | +[default7]: iteration 5231/ 6200 | consumed samples: 5356544 | consumed tokens: 10970202112 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658243E+00 | loss scale: 1024.0 | grad norm: 6.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.813 | TFLOPs: 42.31 | +[default7]: iteration 5232/ 6200 | consumed samples: 5357568 | consumed tokens: 10972299264 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659365E+00 | loss scale: 1024.0 | grad norm: 5.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.660 | TFLOPs: 42.26 | +[default7]: iteration 5233/ 6200 | consumed samples: 5358592 | consumed tokens: 10974396416 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657565E+00 | loss scale: 1024.0 | grad norm: 5.869 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.601 | TFLOPs: 41.94 | +[default7]: iteration 5234/ 6200 | consumed samples: 5359616 | consumed tokens: 10976493568 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.680429E+00 | loss scale: 1024.0 | grad norm: 5.715 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.848 | TFLOPs: 42.02 | +[default7]: iteration 5235/ 6200 | consumed samples: 5360640 | consumed tokens: 10978590720 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637756E+00 | loss scale: 1024.0 | grad norm: 5.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.998 | TFLOPs: 42.37 | +[default7]: iteration 5236/ 6200 | consumed samples: 5361664 | consumed tokens: 10980687872 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657935E+00 | loss scale: 1024.0 | grad norm: 5.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.715 | TFLOPs: 42.28 | +[default7]: iteration 5237/ 6200 | consumed samples: 5362688 | consumed tokens: 10982785024 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650282E+00 | loss scale: 1024.0 | grad norm: 6.060 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.000 | TFLOPs: 42.37 | +[default7]: iteration 5238/ 6200 | consumed samples: 5363712 | consumed tokens: 10984882176 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651749E+00 | loss scale: 1024.0 | grad norm: 5.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.170 | TFLOPs: 42.12 | +[default7]: iteration 5239/ 6200 | consumed samples: 5364736 | consumed tokens: 10986979328 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665430E+00 | loss scale: 1024.0 | grad norm: 6.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.750 | TFLOPs: 42.29 | +[default7]: iteration 5240/ 6200 | consumed samples: 5365760 | consumed tokens: 10989076480 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634206E+00 | loss scale: 1024.0 | grad norm: 5.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.740 | TFLOPs: 42.29 | +[default7]: iteration 5241/ 6200 | consumed samples: 5366784 | consumed tokens: 10991173632 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.683942E+00 | loss scale: 1024.0 | grad norm: 6.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.979 | TFLOPs: 42.36 | +[default7]: iteration 5242/ 6200 | consumed samples: 5367808 | consumed tokens: 10993270784 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630799E+00 | loss scale: 1024.0 | grad norm: 5.891 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.745 | TFLOPs: 42.29 | +[default7]: iteration 5243/ 6200 | consumed samples: 5368832 | consumed tokens: 10995367936 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657659E+00 | loss scale: 1024.0 | grad norm: 5.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.197 | TFLOPs: 42.43 | +[default7]: iteration 5244/ 6200 | consumed samples: 5369856 | consumed tokens: 10997465088 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632439E+00 | loss scale: 1024.0 | grad norm: 5.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.845 | TFLOPs: 42.32 | +[default7]: iteration 5245/ 6200 | consumed samples: 5370880 | consumed tokens: 10999562240 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653740E+00 | loss scale: 1024.0 | grad norm: 5.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.586 | TFLOPs: 42.24 | +[default7]: iteration 5246/ 6200 | consumed samples: 5371904 | consumed tokens: 11001659392 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659695E+00 | loss scale: 1024.0 | grad norm: 6.957 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.630 | TFLOPs: 42.26 | +[default7]: iteration 5247/ 6200 | consumed samples: 5372928 | consumed tokens: 11003756544 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656315E+00 | loss scale: 1024.0 | grad norm: 5.981 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.586 | TFLOPs: 42.24 | +[default7]: iteration 5248/ 6200 | consumed samples: 5373952 | consumed tokens: 11005853696 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648133E+00 | loss scale: 1024.0 | grad norm: 5.061 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.736 | TFLOPs: 42.29 | +[default7]: iteration 5249/ 6200 | consumed samples: 5374976 | consumed tokens: 11007950848 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660777E+00 | loss scale: 1024.0 | grad norm: 5.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.840 | TFLOPs: 42.32 | +[default7]: iteration 5250/ 6200 | consumed samples: 5376000 | consumed tokens: 11010048000 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632946E+00 | loss scale: 1024.0 | grad norm: 6.943 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.129 | TFLOPs: 42.41 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 5250 | lm loss value: 3.698761E+00 | lm loss PPL: 4.039721E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 5250 | lm loss value: 1.544265E+00 | lm loss PPL: 4.684526E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 5250 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-07 00:01:24,217] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5250 is begin to save! +[default0]:[2022-10-07 00:01:24,222] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:24,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:24,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:24,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:24,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:24,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:24,672] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:24,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:24,699] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:24,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:24,728] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:24,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:24,755] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:24,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:24,783] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:24,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:24,813] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:24,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:24,851] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:24,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:24,878] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:24,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:24,904] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:24,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:24,930] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:24,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:24,958] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:24,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:24,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:25,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:25,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:25,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:25,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:25,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:25,133] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:25,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:25,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:25,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:25,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:25,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:25,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:25,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:25,244] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:25,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:25,273] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:25,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:25,300] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:25,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:25,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:25,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:25,355] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 00:01:25,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 00:01:25,357] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/mp_rank_00_model_states.pt +[default0]:[2022-10-07 00:01:25,357] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 00:01:25,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/mp_rank_00_model_states.pt. +[default1]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 00:01:25,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 00:01:25,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 00:01:25,640] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 00:01:25,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 00:01:25,665] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 00:01:25,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 00:01:25,638] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 00:01:25,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 00:01:25,692] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 00:01:25,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 00:01:25,639] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 00:01:25,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 00:01:25,728] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 00:01:25,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 00:01:25,743] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 00:01:25,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 00:01:25,714] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 00:01:25,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 00:01:25,742] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 00:01:25,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 00:01:25,762] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 00:01:25,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 00:01:25,693] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 00:01:25,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 00:01:25,761] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 00:01:25,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 00:01:25,729] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 00:01:25,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 00:01:25,783] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 00:01:25,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 00:01:25,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 00:01:25,733] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 00:01:25,713] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 00:01:25,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 00:01:25,758] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 00:01:25,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 00:01:25,774] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 00:01:25,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 00:01:25,760] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 00:01:25,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 00:01:25,761] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 00:01:25,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 00:01:25,762] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 00:01:25,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 00:01:25,766] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 00:01:25,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 00:01:25,803] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 00:01:25,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 00:01:25,962] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 00:01:25,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 00:01:25,911] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 00:01:25,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 00:01:25,892] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 00:01:25,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 00:01:25,930] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 00:01:25,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 00:01:25,955] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 00:01:25,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 00:01:25,932] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 00:01:25,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 00:01:25,976] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 00:01:26,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 00:01:26,007] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default1]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default6]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default5]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default7]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default0]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default7]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default7]:time (ms) | save-checkpoint: 1925.84 +[default6]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default0]:[2022-10-07 00:01:26,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default3]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default7]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default5]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default7]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default4]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default2]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default3]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default4]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default5]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default0]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default2]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 00:01:26,141] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5250/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default1]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default3]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default0]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default0]: successfully saved checkpoint at iteration 5250 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default3]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default4]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default2]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default4]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default6]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default1]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default5]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default2]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default1]:[2022-10-07 00:01:26,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5250 is ready now! +[default7]: iteration 5251/ 6200 | consumed samples: 5377024 | consumed tokens: 11012145152 | elapsed time per iteration (s): 53.57 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650069E+00 | loss scale: 1024.0 | grad norm: 6.006 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.114 | TFLOPs: 5.83 | +[default7]: iteration 5252/ 6200 | consumed samples: 5378048 | consumed tokens: 11014242304 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644407E+00 | loss scale: 1024.0 | grad norm: 5.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.134 | TFLOPs: 42.41 | +[default7]: iteration 5253/ 6200 | consumed samples: 5379072 | consumed tokens: 11016339456 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670538E+00 | loss scale: 1024.0 | grad norm: 4.930 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.331 | TFLOPs: 42.47 | +[default7]: iteration 5254/ 6200 | consumed samples: 5380096 | consumed tokens: 11018436608 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650981E+00 | loss scale: 1024.0 | grad norm: 6.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.253 | TFLOPs: 42.45 | +[default7]: iteration 5255/ 6200 | consumed samples: 5381120 | consumed tokens: 11020533760 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633090E+00 | loss scale: 1024.0 | grad norm: 6.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.516 | TFLOPs: 42.53 | +[default7]: iteration 5256/ 6200 | consumed samples: 5382144 | consumed tokens: 11022630912 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663581E+00 | loss scale: 1024.0 | grad norm: 4.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.404 | TFLOPs: 42.49 | +[default7]: iteration 5257/ 6200 | consumed samples: 5383168 | consumed tokens: 11024728064 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670956E+00 | loss scale: 1024.0 | grad norm: 5.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.308 | TFLOPs: 42.46 | +[default7]: iteration 5258/ 6200 | consumed samples: 5384192 | consumed tokens: 11026825216 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668337E+00 | loss scale: 1024.0 | grad norm: 5.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.339 | TFLOPs: 42.47 | +[default7]: iteration 5259/ 6200 | consumed samples: 5385216 | consumed tokens: 11028922368 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656033E+00 | loss scale: 1024.0 | grad norm: 5.059 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.255 | TFLOPs: 42.45 | +[default7]: iteration 5260/ 6200 | consumed samples: 5386240 | consumed tokens: 11031019520 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685113E+00 | loss scale: 1024.0 | grad norm: 5.890 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.228 | TFLOPs: 42.44 | +[default7]: iteration 5261/ 6200 | consumed samples: 5387264 | consumed tokens: 11033116672 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635070E+00 | loss scale: 1024.0 | grad norm: 5.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.305 | TFLOPs: 42.46 | +[default7]: iteration 5262/ 6200 | consumed samples: 5388288 | consumed tokens: 11035213824 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631462E+00 | loss scale: 1024.0 | grad norm: 5.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.337 | TFLOPs: 42.47 | +[default7]: iteration 5263/ 6200 | consumed samples: 5389312 | consumed tokens: 11037310976 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666394E+00 | loss scale: 1024.0 | grad norm: 4.758 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.406 | TFLOPs: 42.49 | +[default7]: iteration 5264/ 6200 | consumed samples: 5390336 | consumed tokens: 11039408128 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649167E+00 | loss scale: 1024.0 | grad norm: 6.700 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.492 | TFLOPs: 42.21 | +[default7]: iteration 5265/ 6200 | consumed samples: 5391360 | consumed tokens: 11041505280 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629845E+00 | loss scale: 1024.0 | grad norm: 5.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.212 | TFLOPs: 42.43 | +[default7]: iteration 5266/ 6200 | consumed samples: 5392384 | consumed tokens: 11043602432 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637487E+00 | loss scale: 1024.0 | grad norm: 5.542 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.257 | TFLOPs: 42.45 | +[default7]: iteration 5267/ 6200 | consumed samples: 5393408 | consumed tokens: 11045699584 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624158E+00 | loss scale: 1024.0 | grad norm: 5.663 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.383 | TFLOPs: 42.18 | +[default7]: iteration 5268/ 6200 | consumed samples: 5394432 | consumed tokens: 11047796736 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679200E+00 | loss scale: 1024.0 | grad norm: 5.827 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.764 | TFLOPs: 42.30 | +[default7]: iteration 5269/ 6200 | consumed samples: 5395456 | consumed tokens: 11049893888 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653635E+00 | loss scale: 1024.0 | grad norm: 5.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.420 | TFLOPs: 42.50 | +[default7]: iteration 5270/ 6200 | consumed samples: 5396480 | consumed tokens: 11051991040 | elapsed time per iteration (s): 7.51 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634998E+00 | loss scale: 1024.0 | grad norm: 5.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.295 | TFLOPs: 41.54 | +[default7]: iteration 5271/ 6200 | consumed samples: 5397504 | consumed tokens: 11054088192 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636544E+00 | loss scale: 1024.0 | grad norm: 5.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.278 | TFLOPs: 42.15 | +[default7]: iteration 5272/ 6200 | consumed samples: 5398528 | consumed tokens: 11056185344 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641245E+00 | loss scale: 1024.0 | grad norm: 5.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.210 | TFLOPs: 42.13 | +[default7]: iteration 5273/ 6200 | consumed samples: 5399552 | consumed tokens: 11058282496 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625363E+00 | loss scale: 1024.0 | grad norm: 5.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.218 | TFLOPs: 42.43 | +[default7]: iteration 5274/ 6200 | consumed samples: 5400576 | consumed tokens: 11060379648 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669211E+00 | loss scale: 1024.0 | grad norm: 6.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.280 | TFLOPs: 42.45 | +[default7]: iteration 5275/ 6200 | consumed samples: 5401600 | consumed tokens: 11062476800 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630676E+00 | loss scale: 1024.0 | grad norm: 5.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.090 | TFLOPs: 42.40 | +[default7]: iteration 5276/ 6200 | consumed samples: 5402624 | consumed tokens: 11064573952 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638203E+00 | loss scale: 1024.0 | grad norm: 5.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.180 | TFLOPs: 42.12 | +[default7]: iteration 5277/ 6200 | consumed samples: 5403648 | consumed tokens: 11066671104 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667089E+00 | loss scale: 1024.0 | grad norm: 5.733 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.301 | TFLOPs: 42.46 | +[default7]: iteration 5278/ 6200 | consumed samples: 5404672 | consumed tokens: 11068768256 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652274E+00 | loss scale: 1024.0 | grad norm: 6.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.225 | TFLOPs: 42.44 | +[default7]: iteration 5279/ 6200 | consumed samples: 5405696 | consumed tokens: 11070865408 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643988E+00 | loss scale: 1024.0 | grad norm: 7.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.027 | TFLOPs: 42.38 | +[default7]: iteration 5280/ 6200 | consumed samples: 5406720 | consumed tokens: 11072962560 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632861E+00 | loss scale: 1024.0 | grad norm: 5.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.137 | TFLOPs: 42.41 | +[default7]: iteration 5281/ 6200 | consumed samples: 5407744 | consumed tokens: 11075059712 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654214E+00 | loss scale: 1024.0 | grad norm: 7.605 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.189 | TFLOPs: 42.43 | +[default7]: iteration 5282/ 6200 | consumed samples: 5408768 | consumed tokens: 11077156864 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629605E+00 | loss scale: 1024.0 | grad norm: 6.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.997 | TFLOPs: 42.37 | +[default7]: iteration 5283/ 6200 | consumed samples: 5409792 | consumed tokens: 11079254016 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639278E+00 | loss scale: 1024.0 | grad norm: 5.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.050 | TFLOPs: 42.38 | +[default7]: iteration 5284/ 6200 | consumed samples: 5410816 | consumed tokens: 11081351168 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.676698E+00 | loss scale: 1024.0 | grad norm: 5.963 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.132 | TFLOPs: 42.41 | +[default7]: iteration 5285/ 6200 | consumed samples: 5411840 | consumed tokens: 11083448320 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663084E+00 | loss scale: 1024.0 | grad norm: 7.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.990 | TFLOPs: 42.37 | +[default7]: iteration 5286/ 6200 | consumed samples: 5412864 | consumed tokens: 11085545472 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618527E+00 | loss scale: 1024.0 | grad norm: 6.756 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.152 | TFLOPs: 42.41 | +[default7]: iteration 5287/ 6200 | consumed samples: 5413888 | consumed tokens: 11087642624 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644621E+00 | loss scale: 1024.0 | grad norm: 6.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.116 | TFLOPs: 42.40 | +[default7]: iteration 5288/ 6200 | consumed samples: 5414912 | consumed tokens: 11089739776 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.655720E+00 | loss scale: 1024.0 | grad norm: 5.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.074 | TFLOPs: 42.39 | +[default7]: iteration 5289/ 6200 | consumed samples: 5415936 | consumed tokens: 11091836928 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637456E+00 | loss scale: 1024.0 | grad norm: 5.956 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.886 | TFLOPs: 42.33 | +[default7]: iteration 5290/ 6200 | consumed samples: 5416960 | consumed tokens: 11093934080 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641997E+00 | loss scale: 1024.0 | grad norm: 5.841 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.749 | TFLOPs: 42.29 | +[default7]: iteration 5291/ 6200 | consumed samples: 5417984 | consumed tokens: 11096031232 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642622E+00 | loss scale: 1024.0 | grad norm: 5.906 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.028 | TFLOPs: 42.38 | +[default7]: iteration 5292/ 6200 | consumed samples: 5419008 | consumed tokens: 11098128384 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645108E+00 | loss scale: 1024.0 | grad norm: 4.738 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.929 | TFLOPs: 42.35 | +[default7]: iteration 5293/ 6200 | consumed samples: 5420032 | consumed tokens: 11100225536 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654607E+00 | loss scale: 1024.0 | grad norm: 5.894 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.997 | TFLOPs: 42.37 | +[default7]: iteration 5294/ 6200 | consumed samples: 5421056 | consumed tokens: 11102322688 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649512E+00 | loss scale: 1024.0 | grad norm: 4.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.716 | TFLOPs: 42.28 | +[default7]: iteration 5295/ 6200 | consumed samples: 5422080 | consumed tokens: 11104419840 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657574E+00 | loss scale: 1024.0 | grad norm: 4.962 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.603 | TFLOPs: 42.25 | +[default7]: iteration 5296/ 6200 | consumed samples: 5423104 | consumed tokens: 11106516992 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665906E+00 | loss scale: 1024.0 | grad norm: 5.885 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.744 | TFLOPs: 42.29 | +[default7]: iteration 5297/ 6200 | consumed samples: 5424128 | consumed tokens: 11108614144 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634879E+00 | loss scale: 1024.0 | grad norm: 5.786 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.869 | TFLOPs: 42.33 | +[default7]: iteration 5298/ 6200 | consumed samples: 5425152 | consumed tokens: 11110711296 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653296E+00 | loss scale: 1024.0 | grad norm: 5.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.938 | TFLOPs: 42.35 | +[default7]: iteration 5299/ 6200 | consumed samples: 5426176 | consumed tokens: 11112808448 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657735E+00 | loss scale: 1024.0 | grad norm: 5.724 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.642 | TFLOPs: 42.26 | +[default7]: iteration 5300/ 6200 | consumed samples: 5427200 | consumed tokens: 11114905600 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660251E+00 | loss scale: 1024.0 | grad norm: 4.648 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.738 | TFLOPs: 42.29 | +[default7]: iteration 5301/ 6200 | consumed samples: 5428224 | consumed tokens: 11117002752 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.685049E+00 | loss scale: 1024.0 | grad norm: 5.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.852 | TFLOPs: 42.32 | +[default7]: iteration 5302/ 6200 | consumed samples: 5429248 | consumed tokens: 11119099904 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611804E+00 | loss scale: 1024.0 | grad norm: 6.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.028 | TFLOPs: 42.38 | +[default7]: iteration 5303/ 6200 | consumed samples: 5430272 | consumed tokens: 11121197056 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622916E+00 | loss scale: 1024.0 | grad norm: 4.887 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.123 | TFLOPs: 42.41 | +[default7]: iteration 5304/ 6200 | consumed samples: 5431296 | consumed tokens: 11123294208 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634810E+00 | loss scale: 1024.0 | grad norm: 6.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.147 | TFLOPs: 42.41 | +[default7]: iteration 5305/ 6200 | consumed samples: 5432320 | consumed tokens: 11125391360 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634742E+00 | loss scale: 1024.0 | grad norm: 4.783 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.792 | TFLOPs: 42.30 | +[default7]: iteration 5306/ 6200 | consumed samples: 5433344 | consumed tokens: 11127488512 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638938E+00 | loss scale: 1024.0 | grad norm: 5.522 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.636 | TFLOPs: 42.26 | +[default7]: iteration 5307/ 6200 | consumed samples: 5434368 | consumed tokens: 11129585664 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650160E+00 | loss scale: 1024.0 | grad norm: 5.930 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.742 | TFLOPs: 42.29 | +[default7]: iteration 5308/ 6200 | consumed samples: 5435392 | consumed tokens: 11131682816 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653065E+00 | loss scale: 1024.0 | grad norm: 5.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.616 | TFLOPs: 42.25 | +[default7]: iteration 5309/ 6200 | consumed samples: 5436416 | consumed tokens: 11133779968 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642789E+00 | loss scale: 1024.0 | grad norm: 5.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.623 | TFLOPs: 42.25 | +[default7]: iteration 5310/ 6200 | consumed samples: 5437440 | consumed tokens: 11135877120 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661789E+00 | loss scale: 1024.0 | grad norm: 6.547 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.656 | TFLOPs: 42.26 | +[default7]: iteration 5311/ 6200 | consumed samples: 5438464 | consumed tokens: 11137974272 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650626E+00 | loss scale: 1024.0 | grad norm: 4.875 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.978 | TFLOPs: 42.36 | +[default7]: iteration 5312/ 6200 | consumed samples: 5439488 | consumed tokens: 11140071424 | elapsed time per iteration (s): 7.55 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638465E+00 | loss scale: 1024.0 | grad norm: 4.795 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 135.543 | TFLOPs: 41.31 | +[default7]: iteration 5313/ 6200 | consumed samples: 5440512 | consumed tokens: 11142168576 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673069E+00 | loss scale: 1024.0 | grad norm: 6.763 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.878 | TFLOPs: 42.33 | +[default7]: iteration 5314/ 6200 | consumed samples: 5441536 | consumed tokens: 11144265728 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640433E+00 | loss scale: 1024.0 | grad norm: 7.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.580 | TFLOPs: 42.24 | +[default7]: iteration 5315/ 6200 | consumed samples: 5442560 | consumed tokens: 11146362880 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668623E+00 | loss scale: 1024.0 | grad norm: 5.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.670 | TFLOPs: 42.27 | +[default7]: iteration 5316/ 6200 | consumed samples: 5443584 | consumed tokens: 11148460032 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652725E+00 | loss scale: 1024.0 | grad norm: 4.710 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.811 | TFLOPs: 42.31 | +[default7]: iteration 5317/ 6200 | consumed samples: 5444608 | consumed tokens: 11150557184 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621357E+00 | loss scale: 1024.0 | grad norm: 5.986 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.793 | TFLOPs: 42.31 | +[default7]: iteration 5318/ 6200 | consumed samples: 5445632 | consumed tokens: 11152654336 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667727E+00 | loss scale: 1024.0 | grad norm: 5.971 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.802 | TFLOPs: 42.31 | +[default7]: iteration 5319/ 6200 | consumed samples: 5446656 | consumed tokens: 11154751488 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640801E+00 | loss scale: 1024.0 | grad norm: 5.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.721 | TFLOPs: 42.28 | +[default7]: iteration 5320/ 6200 | consumed samples: 5447680 | consumed tokens: 11156848640 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.616480E+00 | loss scale: 1024.0 | grad norm: 4.909 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.953 | TFLOPs: 42.35 | +[default7]: iteration 5321/ 6200 | consumed samples: 5448704 | consumed tokens: 11158945792 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.620208E+00 | loss scale: 1024.0 | grad norm: 5.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.906 | TFLOPs: 42.34 | +[default7]: iteration 5322/ 6200 | consumed samples: 5449728 | consumed tokens: 11161042944 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643393E+00 | loss scale: 1024.0 | grad norm: 4.949 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.927 | TFLOPs: 42.35 | +[default7]: iteration 5323/ 6200 | consumed samples: 5450752 | consumed tokens: 11163140096 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641448E+00 | loss scale: 1024.0 | grad norm: 5.042 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.961 | TFLOPs: 42.36 | +[default7]: iteration 5324/ 6200 | consumed samples: 5451776 | consumed tokens: 11165237248 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624780E+00 | loss scale: 1024.0 | grad norm: 5.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.565 | TFLOPs: 42.24 | +[default7]: iteration 5325/ 6200 | consumed samples: 5452800 | consumed tokens: 11167334400 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637136E+00 | loss scale: 1024.0 | grad norm: 5.571 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.825 | TFLOPs: 42.32 | +[default7]: iteration 5326/ 6200 | consumed samples: 5453824 | consumed tokens: 11169431552 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652658E+00 | loss scale: 1024.0 | grad norm: 5.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.670 | TFLOPs: 42.27 | +[default7]: iteration 5327/ 6200 | consumed samples: 5454848 | consumed tokens: 11171528704 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633583E+00 | loss scale: 1024.0 | grad norm: 4.991 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.951 | TFLOPs: 42.35 | +[default7]: iteration 5328/ 6200 | consumed samples: 5455872 | consumed tokens: 11173625856 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.616413E+00 | loss scale: 1024.0 | grad norm: 5.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.906 | TFLOPs: 42.34 | +[default7]: iteration 5329/ 6200 | consumed samples: 5456896 | consumed tokens: 11175723008 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638030E+00 | loss scale: 1024.0 | grad norm: 5.992 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.085 | TFLOPs: 42.39 | +[default7]: iteration 5330/ 6200 | consumed samples: 5457920 | consumed tokens: 11177820160 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654881E+00 | loss scale: 1024.0 | grad norm: 5.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.371 | TFLOPs: 42.18 | +[default7]: iteration 5331/ 6200 | consumed samples: 5458944 | consumed tokens: 11179917312 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650078E+00 | loss scale: 1024.0 | grad norm: 4.842 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.753 | TFLOPs: 42.29 | +[default7]: iteration 5332/ 6200 | consumed samples: 5459968 | consumed tokens: 11182014464 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.670435E+00 | loss scale: 1024.0 | grad norm: 4.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.239 | TFLOPs: 42.44 | +[default7]: iteration 5333/ 6200 | consumed samples: 5460992 | consumed tokens: 11184111616 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642432E+00 | loss scale: 1024.0 | grad norm: 4.754 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.372 | TFLOPs: 42.48 | +[default7]: iteration 5334/ 6200 | consumed samples: 5462016 | consumed tokens: 11186208768 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633052E+00 | loss scale: 1024.0 | grad norm: 5.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.188 | TFLOPs: 42.43 | +[default7]: iteration 5335/ 6200 | consumed samples: 5463040 | consumed tokens: 11188305920 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639102E+00 | loss scale: 1024.0 | grad norm: 5.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.301 | TFLOPs: 42.46 | +[default7]: iteration 5336/ 6200 | consumed samples: 5464064 | consumed tokens: 11190403072 | elapsed time per iteration (s): 7.58 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636629E+00 | loss scale: 1024.0 | grad norm: 4.971 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 135.012 | TFLOPs: 41.15 | +[default7]: iteration 5337/ 6200 | consumed samples: 5465088 | consumed tokens: 11192500224 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656969E+00 | loss scale: 1024.0 | grad norm: 5.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.138 | TFLOPs: 42.41 | +[default7]: iteration 5338/ 6200 | consumed samples: 5466112 | consumed tokens: 11194597376 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639574E+00 | loss scale: 1024.0 | grad norm: 5.755 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.073 | TFLOPs: 42.39 | +[default7]: iteration 5339/ 6200 | consumed samples: 5467136 | consumed tokens: 11196694528 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653836E+00 | loss scale: 1024.0 | grad norm: 4.888 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.185 | TFLOPs: 42.42 | +[default7]: iteration 5340/ 6200 | consumed samples: 5468160 | consumed tokens: 11198791680 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638527E+00 | loss scale: 1024.0 | grad norm: 4.950 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.572 | TFLOPs: 41.63 | +[default7]: iteration 5341/ 6200 | consumed samples: 5469184 | consumed tokens: 11200888832 | elapsed time per iteration (s): 7.56 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652238E+00 | loss scale: 1024.0 | grad norm: 5.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 135.417 | TFLOPs: 41.28 | +[default7]: iteration 5342/ 6200 | consumed samples: 5470208 | consumed tokens: 11202985984 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637097E+00 | loss scale: 1024.0 | grad norm: 5.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.763 | TFLOPs: 42.30 | +[default7]: iteration 5343/ 6200 | consumed samples: 5471232 | consumed tokens: 11205083136 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647743E+00 | loss scale: 1024.0 | grad norm: 5.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.019 | TFLOPs: 42.37 | +[default7]: iteration 5344/ 6200 | consumed samples: 5472256 | consumed tokens: 11207180288 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663086E+00 | loss scale: 1024.0 | grad norm: 5.053 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.874 | TFLOPs: 42.33 | +[default7]: iteration 5345/ 6200 | consumed samples: 5473280 | consumed tokens: 11209277440 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641195E+00 | loss scale: 1024.0 | grad norm: 5.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.793 | TFLOPs: 42.31 | +[default7]: iteration 5346/ 6200 | consumed samples: 5474304 | consumed tokens: 11211374592 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.681974E+00 | loss scale: 1024.0 | grad norm: 4.908 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.084 | TFLOPs: 42.39 | +[default7]: iteration 5347/ 6200 | consumed samples: 5475328 | consumed tokens: 11213471744 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647749E+00 | loss scale: 1024.0 | grad norm: 5.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.974 | TFLOPs: 42.36 | +[default7]: iteration 5348/ 6200 | consumed samples: 5476352 | consumed tokens: 11215568896 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648467E+00 | loss scale: 1024.0 | grad norm: 5.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.019 | TFLOPs: 42.37 | +[default7]: iteration 5349/ 6200 | consumed samples: 5477376 | consumed tokens: 11217666048 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622462E+00 | loss scale: 1024.0 | grad norm: 4.636 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.006 | TFLOPs: 42.37 | +[default7]: iteration 5350/ 6200 | consumed samples: 5478400 | consumed tokens: 11219763200 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635176E+00 | loss scale: 1024.0 | grad norm: 5.808 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.074 | TFLOPs: 42.39 | +[default7]: iteration 5351/ 6200 | consumed samples: 5479424 | consumed tokens: 11221860352 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633519E+00 | loss scale: 1024.0 | grad norm: 5.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.003 | TFLOPs: 42.37 | +[default7]: iteration 5352/ 6200 | consumed samples: 5480448 | consumed tokens: 11223957504 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635659E+00 | loss scale: 1024.0 | grad norm: 5.635 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.182 | TFLOPs: 42.42 | +[default7]: iteration 5353/ 6200 | consumed samples: 5481472 | consumed tokens: 11226054656 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632834E+00 | loss scale: 1024.0 | grad norm: 5.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.002 | TFLOPs: 42.37 | +[default7]: iteration 5354/ 6200 | consumed samples: 5482496 | consumed tokens: 11228151808 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644179E+00 | loss scale: 1024.0 | grad norm: 5.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.822 | TFLOPs: 42.31 | +[default7]: iteration 5355/ 6200 | consumed samples: 5483520 | consumed tokens: 11230248960 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634996E+00 | loss scale: 1024.0 | grad norm: 6.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.172 | TFLOPs: 42.42 | +[default7]: iteration 5356/ 6200 | consumed samples: 5484544 | consumed tokens: 11232346112 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649489E+00 | loss scale: 1024.0 | grad norm: 4.892 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.060 | TFLOPs: 42.39 | +[default7]: iteration 5357/ 6200 | consumed samples: 5485568 | consumed tokens: 11234443264 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648478E+00 | loss scale: 1024.0 | grad norm: 5.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.098 | TFLOPs: 42.40 | +[default7]: iteration 5358/ 6200 | consumed samples: 5486592 | consumed tokens: 11236540416 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618012E+00 | loss scale: 1024.0 | grad norm: 5.873 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.112 | TFLOPs: 42.40 | +[default7]: iteration 5359/ 6200 | consumed samples: 5487616 | consumed tokens: 11238637568 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647804E+00 | loss scale: 1024.0 | grad norm: 5.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.110 | TFLOPs: 42.40 | +[default7]: iteration 5360/ 6200 | consumed samples: 5488640 | consumed tokens: 11240734720 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659357E+00 | loss scale: 1024.0 | grad norm: 5.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.808 | TFLOPs: 42.31 | +[default7]: iteration 5361/ 6200 | consumed samples: 5489664 | consumed tokens: 11242831872 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656295E+00 | loss scale: 1024.0 | grad norm: 5.050 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.010 | TFLOPs: 42.37 | +[default7]: iteration 5362/ 6200 | consumed samples: 5490688 | consumed tokens: 11244929024 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628876E+00 | loss scale: 1024.0 | grad norm: 5.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.201 | TFLOPs: 42.43 | +[default7]: iteration 5363/ 6200 | consumed samples: 5491712 | consumed tokens: 11247026176 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636635E+00 | loss scale: 1024.0 | grad norm: 5.948 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.192 | TFLOPs: 42.43 | +[default7]: iteration 5364/ 6200 | consumed samples: 5492736 | consumed tokens: 11249123328 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635017E+00 | loss scale: 1024.0 | grad norm: 5.019 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.079 | TFLOPs: 42.39 | +[default7]: iteration 5365/ 6200 | consumed samples: 5493760 | consumed tokens: 11251220480 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644727E+00 | loss scale: 1024.0 | grad norm: 4.954 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.271 | TFLOPs: 42.45 | +[default7]: iteration 5366/ 6200 | consumed samples: 5494784 | consumed tokens: 11253317632 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658177E+00 | loss scale: 1024.0 | grad norm: 6.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.080 | TFLOPs: 42.39 | +[default7]: iteration 5367/ 6200 | consumed samples: 5495808 | consumed tokens: 11255414784 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.666155E+00 | loss scale: 1024.0 | grad norm: 5.749 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.938 | TFLOPs: 42.35 | +[default7]: iteration 5368/ 6200 | consumed samples: 5496832 | consumed tokens: 11257511936 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646968E+00 | loss scale: 1024.0 | grad norm: 5.049 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.191 | TFLOPs: 42.43 | +[default7]: iteration 5369/ 6200 | consumed samples: 5497856 | consumed tokens: 11259609088 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630806E+00 | loss scale: 1024.0 | grad norm: 5.963 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.146 | TFLOPs: 42.41 | +[default7]: iteration 5370/ 6200 | consumed samples: 5498880 | consumed tokens: 11261706240 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631197E+00 | loss scale: 1024.0 | grad norm: 5.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.649 | TFLOPs: 42.26 | +[default7]: iteration 5371/ 6200 | consumed samples: 5499904 | consumed tokens: 11263803392 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619269E+00 | loss scale: 1024.0 | grad norm: 4.847 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.763 | TFLOPs: 42.30 | +[default7]: iteration 5372/ 6200 | consumed samples: 5500928 | consumed tokens: 11265900544 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652064E+00 | loss scale: 1024.0 | grad norm: 5.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.873 | TFLOPs: 42.33 | +[default7]: iteration 5373/ 6200 | consumed samples: 5501952 | consumed tokens: 11267997696 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.677890E+00 | loss scale: 1024.0 | grad norm: 5.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.787 | TFLOPs: 42.30 | +[default7]: iteration 5374/ 6200 | consumed samples: 5502976 | consumed tokens: 11270094848 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641159E+00 | loss scale: 1024.0 | grad norm: 5.795 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.678 | TFLOPs: 42.27 | +[default7]: iteration 5375/ 6200 | consumed samples: 5504000 | consumed tokens: 11272192000 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.655691E+00 | loss scale: 1024.0 | grad norm: 5.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.580 | TFLOPs: 42.24 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 5375 | lm loss value: 3.665240E+00 | lm loss PPL: 3.906551E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 5375 | lm loss value: 1.532058E+00 | lm loss PPL: 4.627689E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 5376/ 6200 | consumed samples: 5505024 | consumed tokens: 11274289152 | elapsed time per iteration (s): 53.26 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641899E+00 | loss scale: 1024.0 | grad norm: 4.729 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.228 | TFLOPs: 5.86 | +[default7]: iteration 5377/ 6200 | consumed samples: 5506048 | consumed tokens: 11276386304 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631960E+00 | loss scale: 1024.0 | grad norm: 4.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.711 | TFLOPs: 42.28 | +[default7]: iteration 5378/ 6200 | consumed samples: 5507072 | consumed tokens: 11278483456 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.679195E+00 | loss scale: 1024.0 | grad norm: 5.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.804 | TFLOPs: 42.31 | +[default7]: iteration 5379/ 6200 | consumed samples: 5508096 | consumed tokens: 11280580608 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661300E+00 | loss scale: 1024.0 | grad norm: 4.896 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.732 | TFLOPs: 42.29 | +[default7]: iteration 5380/ 6200 | consumed samples: 5509120 | consumed tokens: 11282677760 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647463E+00 | loss scale: 1024.0 | grad norm: 5.026 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.747 | TFLOPs: 42.29 | +[default7]: iteration 5381/ 6200 | consumed samples: 5510144 | consumed tokens: 11284774912 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635091E+00 | loss scale: 1024.0 | grad norm: 5.673 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.030 | TFLOPs: 42.38 | +[default7]: iteration 5382/ 6200 | consumed samples: 5511168 | consumed tokens: 11286872064 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644892E+00 | loss scale: 1024.0 | grad norm: 5.007 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.049 | TFLOPs: 42.38 | +[default7]: iteration 5383/ 6200 | consumed samples: 5512192 | consumed tokens: 11288969216 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645647E+00 | loss scale: 1024.0 | grad norm: 5.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.931 | TFLOPs: 42.35 | +[default7]: iteration 5384/ 6200 | consumed samples: 5513216 | consumed tokens: 11291066368 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619414E+00 | loss scale: 1024.0 | grad norm: 5.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.071 | TFLOPs: 42.39 | +[default7]: iteration 5385/ 6200 | consumed samples: 5514240 | consumed tokens: 11293163520 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625682E+00 | loss scale: 1024.0 | grad norm: 5.718 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.938 | TFLOPs: 42.35 | +[default7]: iteration 5386/ 6200 | consumed samples: 5515264 | consumed tokens: 11295260672 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632462E+00 | loss scale: 1024.0 | grad norm: 5.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.938 | TFLOPs: 42.35 | +[default7]: iteration 5387/ 6200 | consumed samples: 5516288 | consumed tokens: 11297357824 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665137E+00 | loss scale: 1024.0 | grad norm: 5.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.914 | TFLOPs: 42.34 | +[default7]: iteration 5388/ 6200 | consumed samples: 5517312 | consumed tokens: 11299454976 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630595E+00 | loss scale: 1024.0 | grad norm: 6.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.947 | TFLOPs: 42.35 | +[default7]: iteration 5389/ 6200 | consumed samples: 5518336 | consumed tokens: 11301552128 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.655790E+00 | loss scale: 1024.0 | grad norm: 5.862 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.475 | TFLOPs: 42.21 | +[default7]: iteration 5390/ 6200 | consumed samples: 5519360 | consumed tokens: 11303649280 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673051E+00 | loss scale: 1024.0 | grad norm: 4.831 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.038 | TFLOPs: 42.38 | +[default7]: iteration 5391/ 6200 | consumed samples: 5520384 | consumed tokens: 11305746432 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.620160E+00 | loss scale: 1024.0 | grad norm: 5.717 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.079 | TFLOPs: 42.39 | +[default7]: iteration 5392/ 6200 | consumed samples: 5521408 | consumed tokens: 11307843584 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622990E+00 | loss scale: 1024.0 | grad norm: 5.005 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.838 | TFLOPs: 42.32 | +[default7]: iteration 5393/ 6200 | consumed samples: 5522432 | consumed tokens: 11309940736 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644574E+00 | loss scale: 1024.0 | grad norm: 5.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.002 | TFLOPs: 42.37 | +[default7]: iteration 5394/ 6200 | consumed samples: 5523456 | consumed tokens: 11312037888 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667600E+00 | loss scale: 1024.0 | grad norm: 6.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.066 | TFLOPs: 42.39 | +[default7]: iteration 5395/ 6200 | consumed samples: 5524480 | consumed tokens: 11314135040 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609203E+00 | loss scale: 1024.0 | grad norm: 5.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.034 | TFLOPs: 42.38 | +[default7]: iteration 5396/ 6200 | consumed samples: 5525504 | consumed tokens: 11316232192 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643457E+00 | loss scale: 1024.0 | grad norm: 4.725 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.243 | TFLOPs: 42.44 | +[default7]: iteration 5397/ 6200 | consumed samples: 5526528 | consumed tokens: 11318329344 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.664835E+00 | loss scale: 1024.0 | grad norm: 4.778 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.960 | TFLOPs: 42.36 | +[default7]: iteration 5398/ 6200 | consumed samples: 5527552 | consumed tokens: 11320426496 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633552E+00 | loss scale: 1024.0 | grad norm: 5.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.784 | TFLOPs: 42.30 | +[default7]: iteration 5399/ 6200 | consumed samples: 5528576 | consumed tokens: 11322523648 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646813E+00 | loss scale: 1024.0 | grad norm: 4.905 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.814 | TFLOPs: 42.31 | +[default7]: iteration 5400/ 6200 | consumed samples: 5529600 | consumed tokens: 11324620800 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638913E+00 | loss scale: 1024.0 | grad norm: 5.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.991 | TFLOPs: 42.37 | +[default7]: iteration 5401/ 6200 | consumed samples: 5530624 | consumed tokens: 11326717952 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641380E+00 | loss scale: 1024.0 | grad norm: 5.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.970 | TFLOPs: 42.36 | +[default7]: iteration 5402/ 6200 | consumed samples: 5531648 | consumed tokens: 11328815104 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634501E+00 | loss scale: 1024.0 | grad norm: 4.895 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.008 | TFLOPs: 42.37 | +[default7]: iteration 5403/ 6200 | consumed samples: 5532672 | consumed tokens: 11330912256 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647797E+00 | loss scale: 1024.0 | grad norm: 4.916 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.962 | TFLOPs: 42.36 | +[default7]: iteration 5404/ 6200 | consumed samples: 5533696 | consumed tokens: 11333009408 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.672575E+00 | loss scale: 1024.0 | grad norm: 4.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.944 | TFLOPs: 42.35 | +[default7]: iteration 5405/ 6200 | consumed samples: 5534720 | consumed tokens: 11335106560 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671592E+00 | loss scale: 1024.0 | grad norm: 5.938 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.907 | TFLOPs: 42.34 | +[default7]: iteration 5406/ 6200 | consumed samples: 5535744 | consumed tokens: 11337203712 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630630E+00 | loss scale: 1024.0 | grad norm: 8.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.833 | TFLOPs: 42.32 | +[default7]: iteration 5407/ 6200 | consumed samples: 5536768 | consumed tokens: 11339300864 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627234E+00 | loss scale: 1024.0 | grad norm: 6.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.962 | TFLOPs: 42.36 | +[default7]: iteration 5408/ 6200 | consumed samples: 5537792 | consumed tokens: 11341398016 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643424E+00 | loss scale: 1024.0 | grad norm: 5.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.050 | TFLOPs: 42.38 | +[default7]: iteration 5409/ 6200 | consumed samples: 5538816 | consumed tokens: 11343495168 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657758E+00 | loss scale: 1024.0 | grad norm: 6.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.075 | TFLOPs: 42.39 | +[default7]: iteration 5410/ 6200 | consumed samples: 5539840 | consumed tokens: 11345592320 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629389E+00 | loss scale: 1024.0 | grad norm: 5.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.018 | TFLOPs: 42.37 | +[default7]: iteration 5411/ 6200 | consumed samples: 5540864 | consumed tokens: 11347689472 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643821E+00 | loss scale: 1024.0 | grad norm: 6.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.990 | TFLOPs: 42.37 | +[default7]: iteration 5412/ 6200 | consumed samples: 5541888 | consumed tokens: 11349786624 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647005E+00 | loss scale: 1024.0 | grad norm: 5.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.900 | TFLOPs: 42.34 | +[default7]: iteration 5413/ 6200 | consumed samples: 5542912 | consumed tokens: 11351883776 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641602E+00 | loss scale: 1024.0 | grad norm: 6.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.807 | TFLOPs: 42.00 | +[default7]: iteration 5414/ 6200 | consumed samples: 5543936 | consumed tokens: 11353980928 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629645E+00 | loss scale: 1024.0 | grad norm: 6.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.056 | TFLOPs: 42.39 | +[default7]: iteration 5415/ 6200 | consumed samples: 5544960 | consumed tokens: 11356078080 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645029E+00 | loss scale: 1024.0 | grad norm: 4.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.262 | TFLOPs: 42.45 | +[default7]: iteration 5416/ 6200 | consumed samples: 5545984 | consumed tokens: 11358175232 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667168E+00 | loss scale: 1024.0 | grad norm: 5.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.072 | TFLOPs: 42.39 | +[default7]: iteration 5417/ 6200 | consumed samples: 5547008 | consumed tokens: 11360272384 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640194E+00 | loss scale: 1024.0 | grad norm: 4.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.177 | TFLOPs: 42.42 | +[default7]: iteration 5418/ 6200 | consumed samples: 5548032 | consumed tokens: 11362369536 | elapsed time per iteration (s): 7.34 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629935E+00 | loss scale: 1024.0 | grad norm: 6.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.522 | TFLOPs: 42.53 | +[default7]: iteration 5419/ 6200 | consumed samples: 5549056 | consumed tokens: 11364466688 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650652E+00 | loss scale: 1024.0 | grad norm: 5.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.189 | TFLOPs: 42.43 | +[default7]: iteration 5420/ 6200 | consumed samples: 5550080 | consumed tokens: 11366563840 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638604E+00 | loss scale: 1024.0 | grad norm: 5.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.078 | TFLOPs: 42.39 | +[default7]: iteration 5421/ 6200 | consumed samples: 5551104 | consumed tokens: 11368660992 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621754E+00 | loss scale: 1024.0 | grad norm: 5.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.983 | TFLOPs: 42.36 | +[default7]: iteration 5422/ 6200 | consumed samples: 5552128 | consumed tokens: 11370758144 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654252E+00 | loss scale: 1024.0 | grad norm: 4.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.191 | TFLOPs: 42.43 | +[default7]: iteration 5423/ 6200 | consumed samples: 5553152 | consumed tokens: 11372855296 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611287E+00 | loss scale: 1024.0 | grad norm: 5.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.892 | TFLOPs: 42.34 | +[default7]: iteration 5424/ 6200 | consumed samples: 5554176 | consumed tokens: 11374952448 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638955E+00 | loss scale: 1024.0 | grad norm: 5.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.535 | TFLOPs: 41.92 | +[default7]: iteration 5425/ 6200 | consumed samples: 5555200 | consumed tokens: 11377049600 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.687091E+00 | loss scale: 1024.0 | grad norm: 5.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.015 | TFLOPs: 42.07 | +[default7]: iteration 5426/ 6200 | consumed samples: 5556224 | consumed tokens: 11379146752 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648225E+00 | loss scale: 1024.0 | grad norm: 5.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.509 | TFLOPs: 42.22 | +[default7]: iteration 5427/ 6200 | consumed samples: 5557248 | consumed tokens: 11381243904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.600694E+00 | loss scale: 1024.0 | grad norm: 5.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.718 | TFLOPs: 42.28 | +[default7]: iteration 5428/ 6200 | consumed samples: 5558272 | consumed tokens: 11383341056 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.620216E+00 | loss scale: 1024.0 | grad norm: 5.047 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.378 | TFLOPs: 42.18 | +[default7]: iteration 5429/ 6200 | consumed samples: 5559296 | consumed tokens: 11385438208 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622764E+00 | loss scale: 1024.0 | grad norm: 5.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.522 | TFLOPs: 42.22 | +[default7]: iteration 5430/ 6200 | consumed samples: 5560320 | consumed tokens: 11387535360 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652286E+00 | loss scale: 1024.0 | grad norm: 6.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.387 | TFLOPs: 42.18 | +[default7]: iteration 5431/ 6200 | consumed samples: 5561344 | consumed tokens: 11389632512 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652284E+00 | loss scale: 1024.0 | grad norm: 5.741 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.744 | TFLOPs: 42.29 | +[default7]: iteration 5432/ 6200 | consumed samples: 5562368 | consumed tokens: 11391729664 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639522E+00 | loss scale: 1024.0 | grad norm: 5.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.265 | TFLOPs: 42.45 | +[default7]: iteration 5433/ 6200 | consumed samples: 5563392 | consumed tokens: 11393826816 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644427E+00 | loss scale: 1024.0 | grad norm: 6.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.212 | TFLOPs: 42.43 | +[default7]: iteration 5434/ 6200 | consumed samples: 5564416 | consumed tokens: 11395923968 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662823E+00 | loss scale: 1024.0 | grad norm: 5.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.366 | TFLOPs: 42.48 | +[default7]: iteration 5435/ 6200 | consumed samples: 5565440 | consumed tokens: 11398021120 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668652E+00 | loss scale: 1024.0 | grad norm: 4.934 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.358 | TFLOPs: 42.48 | +[default7]: iteration 5436/ 6200 | consumed samples: 5566464 | consumed tokens: 11400118272 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634258E+00 | loss scale: 1024.0 | grad norm: 5.661 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.284 | TFLOPs: 42.46 | +[default7]: iteration 5437/ 6200 | consumed samples: 5567488 | consumed tokens: 11402215424 | elapsed time per iteration (s): 7.50 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641881E+00 | loss scale: 1024.0 | grad norm: 7.744 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.454 | TFLOPs: 41.59 | +[default7]: iteration 5438/ 6200 | consumed samples: 5568512 | consumed tokens: 11404312576 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630601E+00 | loss scale: 1024.0 | grad norm: 5.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.064 | TFLOPs: 42.39 | +[default7]: iteration 5439/ 6200 | consumed samples: 5569536 | consumed tokens: 11406409728 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.613842E+00 | loss scale: 1024.0 | grad norm: 5.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.296 | TFLOPs: 42.46 | +[default7]: iteration 5440/ 6200 | consumed samples: 5570560 | consumed tokens: 11408506880 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615100E+00 | loss scale: 1024.0 | grad norm: 5.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.306 | TFLOPs: 42.46 | +[default7]: iteration 5441/ 6200 | consumed samples: 5571584 | consumed tokens: 11410604032 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614892E+00 | loss scale: 1024.0 | grad norm: 5.787 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.232 | TFLOPs: 42.44 | +[default7]: iteration 5442/ 6200 | consumed samples: 5572608 | consumed tokens: 11412701184 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636878E+00 | loss scale: 1024.0 | grad norm: 6.686 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.294 | TFLOPs: 42.46 | +[default7]: iteration 5443/ 6200 | consumed samples: 5573632 | consumed tokens: 11414798336 | elapsed time per iteration (s): 7.47 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662124E+00 | loss scale: 1024.0 | grad norm: 4.780 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.087 | TFLOPs: 41.79 | +[default7]: iteration 5444/ 6200 | consumed samples: 5574656 | consumed tokens: 11416895488 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.620827E+00 | loss scale: 1024.0 | grad norm: 6.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.267 | TFLOPs: 42.45 | +[default7]: iteration 5445/ 6200 | consumed samples: 5575680 | consumed tokens: 11418992640 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632553E+00 | loss scale: 1024.0 | grad norm: 5.750 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.113 | TFLOPs: 42.40 | +[default7]: iteration 5446/ 6200 | consumed samples: 5576704 | consumed tokens: 11421089792 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650435E+00 | loss scale: 1024.0 | grad norm: 5.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.207 | TFLOPs: 42.43 | +[default7]: iteration 5447/ 6200 | consumed samples: 5577728 | consumed tokens: 11423186944 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643843E+00 | loss scale: 1024.0 | grad norm: 5.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.239 | TFLOPs: 42.44 | +[default7]: iteration 5448/ 6200 | consumed samples: 5578752 | consumed tokens: 11425284096 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661299E+00 | loss scale: 1024.0 | grad norm: 4.845 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.236 | TFLOPs: 42.44 | +[default7]: iteration 5449/ 6200 | consumed samples: 5579776 | consumed tokens: 11427381248 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654851E+00 | loss scale: 1024.0 | grad norm: 4.758 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.913 | TFLOPs: 42.04 | +[default7]: iteration 5450/ 6200 | consumed samples: 5580800 | consumed tokens: 11429478400 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645768E+00 | loss scale: 1024.0 | grad norm: 5.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.995 | TFLOPs: 42.37 | +[default7]: iteration 5451/ 6200 | consumed samples: 5581824 | consumed tokens: 11431575552 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632055E+00 | loss scale: 1024.0 | grad norm: 5.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.164 | TFLOPs: 42.42 | +[default7]: iteration 5452/ 6200 | consumed samples: 5582848 | consumed tokens: 11433672704 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645110E+00 | loss scale: 1024.0 | grad norm: 5.989 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.211 | TFLOPs: 42.43 | +[default7]: iteration 5453/ 6200 | consumed samples: 5583872 | consumed tokens: 11435769856 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.608095E+00 | loss scale: 1024.0 | grad norm: 7.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.158 | TFLOPs: 42.42 | +[default7]: iteration 5454/ 6200 | consumed samples: 5584896 | consumed tokens: 11437867008 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629677E+00 | loss scale: 1024.0 | grad norm: 5.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.825 | TFLOPs: 42.32 | +[default7]: iteration 5455/ 6200 | consumed samples: 5585920 | consumed tokens: 11439964160 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.673730E+00 | loss scale: 1024.0 | grad norm: 4.992 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.571 | TFLOPs: 42.24 | +[default7]: iteration 5456/ 6200 | consumed samples: 5586944 | consumed tokens: 11442061312 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634604E+00 | loss scale: 1024.0 | grad norm: 4.782 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.938 | TFLOPs: 42.35 | +[default7]: iteration 5457/ 6200 | consumed samples: 5587968 | consumed tokens: 11444158464 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661735E+00 | loss scale: 1024.0 | grad norm: 5.635 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.967 | TFLOPs: 42.36 | +[default7]: iteration 5458/ 6200 | consumed samples: 5588992 | consumed tokens: 11446255616 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652608E+00 | loss scale: 1024.0 | grad norm: 5.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.096 | TFLOPs: 42.40 | +[default7]: iteration 5459/ 6200 | consumed samples: 5590016 | consumed tokens: 11448352768 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.671830E+00 | loss scale: 1024.0 | grad norm: 5.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.973 | TFLOPs: 42.36 | +[default7]: iteration 5460/ 6200 | consumed samples: 5591040 | consumed tokens: 11450449920 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632968E+00 | loss scale: 1024.0 | grad norm: 5.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.805 | TFLOPs: 42.31 | +[default7]: iteration 5461/ 6200 | consumed samples: 5592064 | consumed tokens: 11452547072 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630962E+00 | loss scale: 1024.0 | grad norm: 5.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.063 | TFLOPs: 42.39 | +[default7]: iteration 5462/ 6200 | consumed samples: 5593088 | consumed tokens: 11454644224 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.669572E+00 | loss scale: 1024.0 | grad norm: 5.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.990 | TFLOPs: 42.37 | +[default7]: iteration 5463/ 6200 | consumed samples: 5594112 | consumed tokens: 11456741376 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614079E+00 | loss scale: 1024.0 | grad norm: 5.036 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.456 | TFLOPs: 42.20 | +[default7]: iteration 5464/ 6200 | consumed samples: 5595136 | consumed tokens: 11458838528 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633894E+00 | loss scale: 1024.0 | grad norm: 5.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.369 | TFLOPs: 42.18 | +[default7]: iteration 5465/ 6200 | consumed samples: 5596160 | consumed tokens: 11460935680 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651950E+00 | loss scale: 1024.0 | grad norm: 4.645 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.240 | TFLOPs: 42.14 | +[default7]: iteration 5466/ 6200 | consumed samples: 5597184 | consumed tokens: 11463032832 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626361E+00 | loss scale: 1024.0 | grad norm: 4.722 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.480 | TFLOPs: 42.21 | +[default7]: iteration 5467/ 6200 | consumed samples: 5598208 | consumed tokens: 11465129984 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640136E+00 | loss scale: 1024.0 | grad norm: 5.016 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.438 | TFLOPs: 42.20 | +[default7]: iteration 5468/ 6200 | consumed samples: 5599232 | consumed tokens: 11467227136 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637062E+00 | loss scale: 1024.0 | grad norm: 5.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.450 | TFLOPs: 42.20 | +[default7]: iteration 5469/ 6200 | consumed samples: 5600256 | consumed tokens: 11469324288 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634980E+00 | loss scale: 1024.0 | grad norm: 5.603 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.479 | TFLOPs: 42.21 | +[default7]: iteration 5470/ 6200 | consumed samples: 5601280 | consumed tokens: 11471421440 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650156E+00 | loss scale: 1024.0 | grad norm: 5.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.937 | TFLOPs: 42.35 | +[default7]: iteration 5471/ 6200 | consumed samples: 5602304 | consumed tokens: 11473518592 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626594E+00 | loss scale: 1024.0 | grad norm: 5.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.478 | TFLOPs: 42.21 | +[default7]: iteration 5472/ 6200 | consumed samples: 5603328 | consumed tokens: 11475615744 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609577E+00 | loss scale: 1024.0 | grad norm: 6.009 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.584 | TFLOPs: 42.24 | +[default7]: iteration 5473/ 6200 | consumed samples: 5604352 | consumed tokens: 11477712896 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632753E+00 | loss scale: 1024.0 | grad norm: 6.624 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.287 | TFLOPs: 42.15 | +[default7]: iteration 5474/ 6200 | consumed samples: 5605376 | consumed tokens: 11479810048 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637024E+00 | loss scale: 1024.0 | grad norm: 4.929 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.700 | TFLOPs: 42.28 | +[default7]: iteration 5475/ 6200 | consumed samples: 5606400 | consumed tokens: 11481907200 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638667E+00 | loss scale: 1024.0 | grad norm: 6.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.555 | TFLOPs: 42.23 | +[default7]: iteration 5476/ 6200 | consumed samples: 5607424 | consumed tokens: 11484004352 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.608256E+00 | loss scale: 1024.0 | grad norm: 5.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.403 | TFLOPs: 42.19 | +[default7]: iteration 5477/ 6200 | consumed samples: 5608448 | consumed tokens: 11486101504 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660040E+00 | loss scale: 1024.0 | grad norm: 5.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.497 | TFLOPs: 42.22 | +[default7]: iteration 5478/ 6200 | consumed samples: 5609472 | consumed tokens: 11488198656 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642843E+00 | loss scale: 1024.0 | grad norm: 7.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.573 | TFLOPs: 42.24 | +[default7]: iteration 5479/ 6200 | consumed samples: 5610496 | consumed tokens: 11490295808 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642433E+00 | loss scale: 1024.0 | grad norm: 6.897 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.586 | TFLOPs: 42.24 | +[default7]: iteration 5480/ 6200 | consumed samples: 5611520 | consumed tokens: 11492392960 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645829E+00 | loss scale: 1024.0 | grad norm: 5.764 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.646 | TFLOPs: 42.26 | +[default7]: iteration 5481/ 6200 | consumed samples: 5612544 | consumed tokens: 11494490112 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653185E+00 | loss scale: 1024.0 | grad norm: 4.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.237 | TFLOPs: 42.14 | +[default7]: iteration 5482/ 6200 | consumed samples: 5613568 | consumed tokens: 11496587264 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633616E+00 | loss scale: 1024.0 | grad norm: 5.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.807 | TFLOPs: 42.31 | +[default7]: iteration 5483/ 6200 | consumed samples: 5614592 | consumed tokens: 11498684416 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.596861E+00 | loss scale: 1024.0 | grad norm: 6.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.518 | TFLOPs: 42.22 | +[default7]: iteration 5484/ 6200 | consumed samples: 5615616 | consumed tokens: 11500781568 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654074E+00 | loss scale: 1024.0 | grad norm: 5.911 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.566 | TFLOPs: 42.24 | +[default7]: iteration 5485/ 6200 | consumed samples: 5616640 | consumed tokens: 11502878720 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.607967E+00 | loss scale: 1024.0 | grad norm: 5.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.430 | TFLOPs: 42.19 | +[default7]: iteration 5486/ 6200 | consumed samples: 5617664 | consumed tokens: 11504975872 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640688E+00 | loss scale: 1024.0 | grad norm: 6.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.673 | TFLOPs: 42.27 | +[default7]: iteration 5487/ 6200 | consumed samples: 5618688 | consumed tokens: 11507073024 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662109E+00 | loss scale: 1024.0 | grad norm: 4.966 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.613 | TFLOPs: 42.25 | +[default7]: iteration 5488/ 6200 | consumed samples: 5619712 | consumed tokens: 11509170176 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634609E+00 | loss scale: 1024.0 | grad norm: 5.631 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.577 | TFLOPs: 42.24 | +[default7]: iteration 5489/ 6200 | consumed samples: 5620736 | consumed tokens: 11511267328 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641177E+00 | loss scale: 1024.0 | grad norm: 5.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.497 | TFLOPs: 42.21 | +[default7]: iteration 5490/ 6200 | consumed samples: 5621760 | consumed tokens: 11513364480 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627225E+00 | loss scale: 1024.0 | grad norm: 5.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.120 | TFLOPs: 42.10 | +[default7]: iteration 5491/ 6200 | consumed samples: 5622784 | consumed tokens: 11515461632 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652955E+00 | loss scale: 1024.0 | grad norm: 7.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.352 | TFLOPs: 42.17 | +[default7]: iteration 5492/ 6200 | consumed samples: 5623808 | consumed tokens: 11517558784 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656501E+00 | loss scale: 1024.0 | grad norm: 5.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.494 | TFLOPs: 42.21 | +[default7]: iteration 5493/ 6200 | consumed samples: 5624832 | consumed tokens: 11519655936 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650055E+00 | loss scale: 1024.0 | grad norm: 4.906 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.548 | TFLOPs: 42.23 | +[default7]: iteration 5494/ 6200 | consumed samples: 5625856 | consumed tokens: 11521753088 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657153E+00 | loss scale: 1024.0 | grad norm: 6.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.439 | TFLOPs: 42.20 | +[default7]: iteration 5495/ 6200 | consumed samples: 5626880 | consumed tokens: 11523850240 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659206E+00 | loss scale: 1024.0 | grad norm: 5.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.176 | TFLOPs: 42.12 | +[default7]: iteration 5496/ 6200 | consumed samples: 5627904 | consumed tokens: 11525947392 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639896E+00 | loss scale: 1024.0 | grad norm: 4.906 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.303 | TFLOPs: 42.16 | +[default7]: iteration 5497/ 6200 | consumed samples: 5628928 | consumed tokens: 11528044544 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634129E+00 | loss scale: 1024.0 | grad norm: 5.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.282 | TFLOPs: 42.15 | +[default7]: iteration 5498/ 6200 | consumed samples: 5629952 | consumed tokens: 11530141696 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635617E+00 | loss scale: 1024.0 | grad norm: 5.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.663 | TFLOPs: 42.27 | +[default7]: iteration 5499/ 6200 | consumed samples: 5630976 | consumed tokens: 11532238848 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624660E+00 | loss scale: 1024.0 | grad norm: 7.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.514 | TFLOPs: 42.22 | +[default7]: iteration 5500/ 6200 | consumed samples: 5632000 | consumed tokens: 11534336000 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629408E+00 | loss scale: 1024.0 | grad norm: 6.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.570 | TFLOPs: 42.24 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 5500 | lm loss value: 3.701406E+00 | lm loss PPL: 4.050423E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 5500 | lm loss value: 1.529701E+00 | lm loss PPL: 4.616797E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 5500 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-07 00:33:40,507] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5500 is begin to save! +[default0]:[2022-10-07 00:33:40,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:40,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:40,901] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:40,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:40,934] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:40,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:40,963] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:40,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:40,999] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,084] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,138] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,223] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,308] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,336] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,363] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,391] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,421] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,449] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,477] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,505] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_25-model_00-model_states.pt... +[default6]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 00:33:41,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 00:33:41,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 00:33:41,594] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/mp_rank_00_model_states.pt +[default0]:[2022-10-07 00:33:41,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 00:33:41,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 00:33:41,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 00:33:41,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 00:33:41,802] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 00:33:41,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 00:33:41,832] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 00:33:41,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 00:33:41,813] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 00:33:41,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 00:33:41,816] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 00:33:41,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 00:33:41,919] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 00:33:41,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 00:33:41,920] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 00:33:41,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 00:33:41,942] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 00:33:41,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 00:33:41,891] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 00:33:41,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 00:33:41,884] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 00:33:41,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 00:33:41,877] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 00:33:41,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 00:33:41,940] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 00:33:41,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 00:33:41,921] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 00:33:41,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 00:33:41,918] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 00:33:41,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 00:33:41,940] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 00:33:41,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 00:33:41,957] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 00:33:41,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 00:33:41,911] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 00:33:41,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 00:33:41,931] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 00:33:41,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 00:33:41,921] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 00:33:41,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 00:33:41,919] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 00:33:41,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 00:33:41,918] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default5]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default3]:[2022-10-07 00:33:41,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 00:33:41,919] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default2]:[2022-10-07 00:33:41,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 00:33:41,934] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default6]:[2022-10-07 00:33:41,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 00:33:41,932] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default4]:[2022-10-07 00:33:41,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 00:33:41,932] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default6]:[2022-10-07 00:33:41,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 00:33:41,985] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default6]:[2022-10-07 00:33:41,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 00:33:41,968] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default3]:[2022-10-07 00:33:41,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 00:33:41,939] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default7]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default7]:[2022-10-07 00:33:41,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 00:33:41,989] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default7]:time (ms) | save-checkpoint: 1495.63 +[default3]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default7]:[2022-10-07 00:33:41,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 00:33:41,962] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default2]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default0]:[2022-10-07 00:33:41,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 00:33:41,984] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default1]:[2022-10-07 00:33:42,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 00:33:42,001] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default5]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default0]:[2022-10-07 00:33:41,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 00:33:41,963] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5500/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default4]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default1]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default0]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default0]: successfully saved checkpoint at iteration 5500 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default3]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default5]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default1]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default4]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default4]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default2]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default1]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default5]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default6]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default2]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default0]:[2022-10-07 00:33:42,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5500 is ready now! +[default7]: iteration 5501/ 6200 | consumed samples: 5633024 | consumed tokens: 11536433152 | elapsed time per iteration (s): 53.04 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640724E+00 | loss scale: 1024.0 | grad norm: 5.644 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.308 | TFLOPs: 5.89 | +[default7]: iteration 5502/ 6200 | consumed samples: 5634048 | consumed tokens: 11538530304 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.613793E+00 | loss scale: 1024.0 | grad norm: 5.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.308 | TFLOPs: 42.16 | +[default7]: iteration 5503/ 6200 | consumed samples: 5635072 | consumed tokens: 11540627456 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.602791E+00 | loss scale: 1024.0 | grad norm: 5.926 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.029 | TFLOPs: 42.07 | +[default7]: iteration 5504/ 6200 | consumed samples: 5636096 | consumed tokens: 11542724608 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.616498E+00 | loss scale: 1024.0 | grad norm: 5.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.118 | TFLOPs: 42.10 | +[default7]: iteration 5505/ 6200 | consumed samples: 5637120 | consumed tokens: 11544821760 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662141E+00 | loss scale: 1024.0 | grad norm: 6.829 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.292 | TFLOPs: 42.15 | +[default7]: iteration 5506/ 6200 | consumed samples: 5638144 | consumed tokens: 11546918912 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.620949E+00 | loss scale: 1024.0 | grad norm: 5.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.206 | TFLOPs: 42.13 | +[default7]: iteration 5507/ 6200 | consumed samples: 5639168 | consumed tokens: 11549016064 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643092E+00 | loss scale: 1024.0 | grad norm: 5.721 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.110 | TFLOPs: 42.10 | +[default7]: iteration 5508/ 6200 | consumed samples: 5640192 | consumed tokens: 11551113216 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641051E+00 | loss scale: 1024.0 | grad norm: 5.891 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.804 | TFLOPs: 42.31 | +[default7]: iteration 5509/ 6200 | consumed samples: 5641216 | consumed tokens: 11553210368 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.665603E+00 | loss scale: 1024.0 | grad norm: 4.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.028 | TFLOPs: 42.38 | +[default7]: iteration 5510/ 6200 | consumed samples: 5642240 | consumed tokens: 11555307520 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638144E+00 | loss scale: 1024.0 | grad norm: 4.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.035 | TFLOPs: 42.38 | +[default7]: iteration 5511/ 6200 | consumed samples: 5643264 | consumed tokens: 11557404672 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627235E+00 | loss scale: 1024.0 | grad norm: 4.819 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.240 | TFLOPs: 42.44 | +[default7]: iteration 5512/ 6200 | consumed samples: 5644288 | consumed tokens: 11559501824 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659466E+00 | loss scale: 1024.0 | grad norm: 4.991 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.123 | TFLOPs: 42.41 | +[default7]: iteration 5513/ 6200 | consumed samples: 5645312 | consumed tokens: 11561598976 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636835E+00 | loss scale: 1024.0 | grad norm: 5.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.772 | TFLOPs: 42.30 | +[default7]: iteration 5514/ 6200 | consumed samples: 5646336 | consumed tokens: 11563696128 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632059E+00 | loss scale: 1024.0 | grad norm: 5.754 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.957 | TFLOPs: 42.36 | +[default7]: iteration 5515/ 6200 | consumed samples: 5647360 | consumed tokens: 11565793280 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660130E+00 | loss scale: 1024.0 | grad norm: 4.827 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.784 | TFLOPs: 42.30 | +[default7]: iteration 5516/ 6200 | consumed samples: 5648384 | consumed tokens: 11567890432 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.620330E+00 | loss scale: 1024.0 | grad norm: 5.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.486 | TFLOPs: 42.21 | +[default7]: iteration 5517/ 6200 | consumed samples: 5649408 | consumed tokens: 11569987584 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627530E+00 | loss scale: 1024.0 | grad norm: 6.015 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.417 | TFLOPs: 42.19 | +[default7]: iteration 5518/ 6200 | consumed samples: 5650432 | consumed tokens: 11572084736 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609652E+00 | loss scale: 1024.0 | grad norm: 4.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.564 | TFLOPs: 42.24 | +[default7]: iteration 5519/ 6200 | consumed samples: 5651456 | consumed tokens: 11574181888 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651614E+00 | loss scale: 1024.0 | grad norm: 5.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.731 | TFLOPs: 42.29 | +[default7]: iteration 5520/ 6200 | consumed samples: 5652480 | consumed tokens: 11576279040 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633461E+00 | loss scale: 1024.0 | grad norm: 5.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.218 | TFLOPs: 42.13 | +[default7]: iteration 5521/ 6200 | consumed samples: 5653504 | consumed tokens: 11578376192 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628767E+00 | loss scale: 1024.0 | grad norm: 4.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.456 | TFLOPs: 42.20 | +[default7]: iteration 5522/ 6200 | consumed samples: 5654528 | consumed tokens: 11580473344 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614553E+00 | loss scale: 1024.0 | grad norm: 4.969 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.580 | TFLOPs: 42.24 | +[default7]: iteration 5523/ 6200 | consumed samples: 5655552 | consumed tokens: 11582570496 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629939E+00 | loss scale: 1024.0 | grad norm: 5.835 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.689 | TFLOPs: 42.27 | +[default7]: iteration 5524/ 6200 | consumed samples: 5656576 | consumed tokens: 11584667648 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624308E+00 | loss scale: 1024.0 | grad norm: 5.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.727 | TFLOPs: 42.29 | +[default7]: iteration 5525/ 6200 | consumed samples: 5657600 | consumed tokens: 11586764800 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637993E+00 | loss scale: 1024.0 | grad norm: 5.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.855 | TFLOPs: 42.32 | +[default7]: iteration 5526/ 6200 | consumed samples: 5658624 | consumed tokens: 11588861952 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634005E+00 | loss scale: 1024.0 | grad norm: 5.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.794 | TFLOPs: 42.31 | +[default7]: iteration 5527/ 6200 | consumed samples: 5659648 | consumed tokens: 11590959104 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640381E+00 | loss scale: 1024.0 | grad norm: 5.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.915 | TFLOPs: 42.34 | +[default7]: iteration 5528/ 6200 | consumed samples: 5660672 | consumed tokens: 11593056256 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657901E+00 | loss scale: 1024.0 | grad norm: 6.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.774 | TFLOPs: 42.30 | +[default7]: iteration 5529/ 6200 | consumed samples: 5661696 | consumed tokens: 11595153408 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624741E+00 | loss scale: 1024.0 | grad norm: 5.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.602 | TFLOPs: 42.25 | +[default7]: iteration 5530/ 6200 | consumed samples: 5662720 | consumed tokens: 11597250560 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624704E+00 | loss scale: 1024.0 | grad norm: 4.959 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.008 | TFLOPs: 42.37 | +[default7]: iteration 5531/ 6200 | consumed samples: 5663744 | consumed tokens: 11599347712 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658103E+00 | loss scale: 1024.0 | grad norm: 5.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.033 | TFLOPs: 42.38 | +[default7]: iteration 5532/ 6200 | consumed samples: 5664768 | consumed tokens: 11601444864 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643592E+00 | loss scale: 1024.0 | grad norm: 5.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.072 | TFLOPs: 42.39 | +[default7]: iteration 5533/ 6200 | consumed samples: 5665792 | consumed tokens: 11603542016 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633627E+00 | loss scale: 1024.0 | grad norm: 5.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.744 | TFLOPs: 42.29 | +[default7]: iteration 5534/ 6200 | consumed samples: 5666816 | consumed tokens: 11605639168 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636853E+00 | loss scale: 1024.0 | grad norm: 4.786 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.092 | TFLOPs: 42.40 | +[default7]: iteration 5535/ 6200 | consumed samples: 5667840 | consumed tokens: 11607736320 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663248E+00 | loss scale: 1024.0 | grad norm: 5.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.943 | TFLOPs: 42.35 | +[default7]: iteration 5536/ 6200 | consumed samples: 5668864 | consumed tokens: 11609833472 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633760E+00 | loss scale: 1024.0 | grad norm: 4.848 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.159 | TFLOPs: 42.42 | +[default7]: iteration 5537/ 6200 | consumed samples: 5669888 | consumed tokens: 11611930624 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639611E+00 | loss scale: 1024.0 | grad norm: 4.767 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.026 | TFLOPs: 42.38 | +[default7]: iteration 5538/ 6200 | consumed samples: 5670912 | consumed tokens: 11614027776 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657360E+00 | loss scale: 1024.0 | grad norm: 5.037 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.756 | TFLOPs: 42.29 | +[default7]: iteration 5539/ 6200 | consumed samples: 5671936 | consumed tokens: 11616124928 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633177E+00 | loss scale: 1024.0 | grad norm: 4.888 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.696 | TFLOPs: 42.28 | +[default7]: iteration 5540/ 6200 | consumed samples: 5672960 | consumed tokens: 11618222080 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658468E+00 | loss scale: 1024.0 | grad norm: 4.710 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.447 | TFLOPs: 42.20 | +[default7]: iteration 5541/ 6200 | consumed samples: 5673984 | consumed tokens: 11620319232 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651216E+00 | loss scale: 1024.0 | grad norm: 5.033 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.646 | TFLOPs: 42.26 | +[default7]: iteration 5542/ 6200 | consumed samples: 5675008 | consumed tokens: 11622416384 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649237E+00 | loss scale: 1024.0 | grad norm: 5.015 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.736 | TFLOPs: 42.29 | +[default7]: iteration 5543/ 6200 | consumed samples: 5676032 | consumed tokens: 11624513536 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639817E+00 | loss scale: 1024.0 | grad norm: 6.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.616 | TFLOPs: 42.25 | +[default7]: iteration 5544/ 6200 | consumed samples: 5677056 | consumed tokens: 11626610688 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628600E+00 | loss scale: 1024.0 | grad norm: 6.047 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.703 | TFLOPs: 42.28 | +[default7]: iteration 5545/ 6200 | consumed samples: 5678080 | consumed tokens: 11628707840 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.607260E+00 | loss scale: 1024.0 | grad norm: 6.904 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.716 | TFLOPs: 42.28 | +[default7]: iteration 5546/ 6200 | consumed samples: 5679104 | consumed tokens: 11630804992 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.604165E+00 | loss scale: 1024.0 | grad norm: 5.873 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.727 | TFLOPs: 42.29 | +[default7]: iteration 5547/ 6200 | consumed samples: 5680128 | consumed tokens: 11632902144 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650528E+00 | loss scale: 1024.0 | grad norm: 5.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.904 | TFLOPs: 42.34 | +[default7]: iteration 5548/ 6200 | consumed samples: 5681152 | consumed tokens: 11634999296 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636191E+00 | loss scale: 1024.0 | grad norm: 5.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.721 | TFLOPs: 42.28 | +[default7]: iteration 5549/ 6200 | consumed samples: 5682176 | consumed tokens: 11637096448 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639431E+00 | loss scale: 1024.0 | grad norm: 5.768 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.485 | TFLOPs: 42.21 | +[default7]: iteration 5550/ 6200 | consumed samples: 5683200 | consumed tokens: 11639193600 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659002E+00 | loss scale: 1024.0 | grad norm: 5.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.677 | TFLOPs: 42.27 | +[default7]: iteration 5551/ 6200 | consumed samples: 5684224 | consumed tokens: 11641290752 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656788E+00 | loss scale: 1024.0 | grad norm: 6.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.804 | TFLOPs: 42.31 | +[default7]: iteration 5552/ 6200 | consumed samples: 5685248 | consumed tokens: 11643387904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650069E+00 | loss scale: 1024.0 | grad norm: 5.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.691 | TFLOPs: 42.27 | +[default7]: iteration 5553/ 6200 | consumed samples: 5686272 | consumed tokens: 11645485056 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649837E+00 | loss scale: 1024.0 | grad norm: 4.905 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.992 | TFLOPs: 42.37 | +[default7]: iteration 5554/ 6200 | consumed samples: 5687296 | consumed tokens: 11647582208 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639362E+00 | loss scale: 1024.0 | grad norm: 5.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.155 | TFLOPs: 42.42 | +[default7]: iteration 5555/ 6200 | consumed samples: 5688320 | consumed tokens: 11649679360 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619928E+00 | loss scale: 1024.0 | grad norm: 4.976 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.977 | TFLOPs: 42.36 | +[default7]: iteration 5556/ 6200 | consumed samples: 5689344 | consumed tokens: 11651776512 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640059E+00 | loss scale: 1024.0 | grad norm: 4.915 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.855 | TFLOPs: 42.32 | +[default7]: iteration 5557/ 6200 | consumed samples: 5690368 | consumed tokens: 11653873664 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631930E+00 | loss scale: 1024.0 | grad norm: 4.882 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.102 | TFLOPs: 42.40 | +[default7]: iteration 5558/ 6200 | consumed samples: 5691392 | consumed tokens: 11655970816 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621460E+00 | loss scale: 1024.0 | grad norm: 6.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.163 | TFLOPs: 42.42 | +[default7]: iteration 5559/ 6200 | consumed samples: 5692416 | consumed tokens: 11658067968 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614781E+00 | loss scale: 1024.0 | grad norm: 6.807 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.927 | TFLOPs: 42.35 | +[default7]: iteration 5560/ 6200 | consumed samples: 5693440 | consumed tokens: 11660165120 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641453E+00 | loss scale: 1024.0 | grad norm: 6.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.914 | TFLOPs: 42.34 | +[default7]: iteration 5561/ 6200 | consumed samples: 5694464 | consumed tokens: 11662262272 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659389E+00 | loss scale: 1024.0 | grad norm: 5.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.041 | TFLOPs: 42.38 | +[default7]: iteration 5562/ 6200 | consumed samples: 5695488 | consumed tokens: 11664359424 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651641E+00 | loss scale: 1024.0 | grad norm: 5.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.273 | TFLOPs: 42.45 | +[default7]: iteration 5563/ 6200 | consumed samples: 5696512 | consumed tokens: 11666456576 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660270E+00 | loss scale: 1024.0 | grad norm: 6.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.290 | TFLOPs: 42.46 | +[default7]: iteration 5564/ 6200 | consumed samples: 5697536 | consumed tokens: 11668553728 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631358E+00 | loss scale: 1024.0 | grad norm: 5.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.120 | TFLOPs: 42.40 | +[default7]: iteration 5565/ 6200 | consumed samples: 5698560 | consumed tokens: 11670650880 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663955E+00 | loss scale: 1024.0 | grad norm: 5.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.155 | TFLOPs: 42.42 | +[default7]: iteration 5566/ 6200 | consumed samples: 5699584 | consumed tokens: 11672748032 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627056E+00 | loss scale: 1024.0 | grad norm: 5.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.065 | TFLOPs: 42.39 | +[default7]: iteration 5567/ 6200 | consumed samples: 5700608 | consumed tokens: 11674845184 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643979E+00 | loss scale: 1024.0 | grad norm: 5.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.997 | TFLOPs: 42.37 | +[default7]: iteration 5568/ 6200 | consumed samples: 5701632 | consumed tokens: 11676942336 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631545E+00 | loss scale: 1024.0 | grad norm: 5.625 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.955 | TFLOPs: 42.35 | +[default7]: iteration 5569/ 6200 | consumed samples: 5702656 | consumed tokens: 11679039488 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621865E+00 | loss scale: 1024.0 | grad norm: 5.041 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.838 | TFLOPs: 42.32 | +[default7]: iteration 5570/ 6200 | consumed samples: 5703680 | consumed tokens: 11681136640 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646869E+00 | loss scale: 1024.0 | grad norm: 6.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.058 | TFLOPs: 42.39 | +[default7]: iteration 5571/ 6200 | consumed samples: 5704704 | consumed tokens: 11683233792 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626071E+00 | loss scale: 1024.0 | grad norm: 5.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.046 | TFLOPs: 42.38 | +[default7]: iteration 5572/ 6200 | consumed samples: 5705728 | consumed tokens: 11685330944 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622950E+00 | loss scale: 1024.0 | grad norm: 5.641 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.146 | TFLOPs: 42.41 | +[default7]: iteration 5573/ 6200 | consumed samples: 5706752 | consumed tokens: 11687428096 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656060E+00 | loss scale: 1024.0 | grad norm: 6.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.107 | TFLOPs: 42.40 | +[default7]: iteration 5574/ 6200 | consumed samples: 5707776 | consumed tokens: 11689525248 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668637E+00 | loss scale: 1024.0 | grad norm: 4.687 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.085 | TFLOPs: 42.39 | +[default7]: iteration 5575/ 6200 | consumed samples: 5708800 | consumed tokens: 11691622400 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630219E+00 | loss scale: 1024.0 | grad norm: 5.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.903 | TFLOPs: 42.34 | +[default7]: iteration 5576/ 6200 | consumed samples: 5709824 | consumed tokens: 11693719552 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614097E+00 | loss scale: 1024.0 | grad norm: 6.001 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.086 | TFLOPs: 42.39 | +[default7]: iteration 5577/ 6200 | consumed samples: 5710848 | consumed tokens: 11695816704 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619513E+00 | loss scale: 1024.0 | grad norm: 4.852 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.961 | TFLOPs: 42.36 | +[default7]: iteration 5578/ 6200 | consumed samples: 5711872 | consumed tokens: 11697913856 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638828E+00 | loss scale: 1024.0 | grad norm: 6.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.076 | TFLOPs: 42.39 | +[default7]: iteration 5579/ 6200 | consumed samples: 5712896 | consumed tokens: 11700011008 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651236E+00 | loss scale: 1024.0 | grad norm: 4.867 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.175 | TFLOPs: 42.42 | +[default7]: iteration 5580/ 6200 | consumed samples: 5713920 | consumed tokens: 11702108160 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660232E+00 | loss scale: 1024.0 | grad norm: 5.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.024 | TFLOPs: 42.38 | +[default7]: iteration 5581/ 6200 | consumed samples: 5714944 | consumed tokens: 11704205312 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624322E+00 | loss scale: 1024.0 | grad norm: 4.951 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.025 | TFLOPs: 42.38 | +[default7]: iteration 5582/ 6200 | consumed samples: 5715968 | consumed tokens: 11706302464 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622649E+00 | loss scale: 1024.0 | grad norm: 5.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.816 | TFLOPs: 42.31 | +[default7]: iteration 5583/ 6200 | consumed samples: 5716992 | consumed tokens: 11708399616 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658752E+00 | loss scale: 1024.0 | grad norm: 5.071 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.335 | TFLOPs: 42.47 | +[default7]: iteration 5584/ 6200 | consumed samples: 5718016 | consumed tokens: 11710496768 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.667522E+00 | loss scale: 1024.0 | grad norm: 4.806 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.998 | TFLOPs: 42.37 | +[default7]: iteration 5585/ 6200 | consumed samples: 5719040 | consumed tokens: 11712593920 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635185E+00 | loss scale: 2048.0 | grad norm: 2.672 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.293 | TFLOPs: 42.46 | +[default7]: iteration 5586/ 6200 | consumed samples: 5720064 | consumed tokens: 11714691072 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662498E+00 | loss scale: 2048.0 | grad norm: 4.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.058 | TFLOPs: 42.39 | +[default7]: iteration 5587/ 6200 | consumed samples: 5721088 | consumed tokens: 11716788224 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.592103E+00 | loss scale: 2048.0 | grad norm: 6.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.132 | TFLOPs: 42.41 | +[default7]: iteration 5588/ 6200 | consumed samples: 5722112 | consumed tokens: 11718885376 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663394E+00 | loss scale: 2048.0 | grad norm: 6.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.052 | TFLOPs: 42.38 | +[default7]: iteration 5589/ 6200 | consumed samples: 5723136 | consumed tokens: 11720982528 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.606691E+00 | loss scale: 2048.0 | grad norm: 4.992 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.155 | TFLOPs: 42.42 | +[default7]: iteration 5590/ 6200 | consumed samples: 5724160 | consumed tokens: 11723079680 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614966E+00 | loss scale: 2048.0 | grad norm: 4.896 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.251 | TFLOPs: 42.44 | +[default7]: iteration 5591/ 6200 | consumed samples: 5725184 | consumed tokens: 11725176832 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629382E+00 | loss scale: 2048.0 | grad norm: 5.049 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.197 | TFLOPs: 42.43 | +[default7]: iteration 5592/ 6200 | consumed samples: 5726208 | consumed tokens: 11727273984 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645452E+00 | loss scale: 2048.0 | grad norm: 5.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.351 | TFLOPs: 42.48 | +[default7]: iteration 5593/ 6200 | consumed samples: 5727232 | consumed tokens: 11729371136 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.597834E+00 | loss scale: 2048.0 | grad norm: 4.802 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.135 | TFLOPs: 42.41 | +[default7]: iteration 5594/ 6200 | consumed samples: 5728256 | consumed tokens: 11731468288 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636936E+00 | loss scale: 2048.0 | grad norm: 4.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.018 | TFLOPs: 42.37 | +[default7]: iteration 5595/ 6200 | consumed samples: 5729280 | consumed tokens: 11733565440 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617544E+00 | loss scale: 2048.0 | grad norm: 5.760 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.101 | TFLOPs: 42.40 | +[default7]: iteration 5596/ 6200 | consumed samples: 5730304 | consumed tokens: 11735662592 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619221E+00 | loss scale: 2048.0 | grad norm: 4.547 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.217 | TFLOPs: 42.43 | +[default7]: iteration 5597/ 6200 | consumed samples: 5731328 | consumed tokens: 11737759744 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648983E+00 | loss scale: 2048.0 | grad norm: 4.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.268 | TFLOPs: 42.45 | +[default7]: iteration 5598/ 6200 | consumed samples: 5732352 | consumed tokens: 11739856896 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626544E+00 | loss scale: 2048.0 | grad norm: 4.933 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.146 | TFLOPs: 42.41 | +[default7]: iteration 5599/ 6200 | consumed samples: 5733376 | consumed tokens: 11741954048 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.616768E+00 | loss scale: 2048.0 | grad norm: 6.082 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.127 | TFLOPs: 42.41 | +[default7]: iteration 5600/ 6200 | consumed samples: 5734400 | consumed tokens: 11744051200 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638491E+00 | loss scale: 2048.0 | grad norm: 5.741 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.320 | TFLOPs: 42.47 | +[default7]: iteration 5601/ 6200 | consumed samples: 5735424 | consumed tokens: 11746148352 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638826E+00 | loss scale: 2048.0 | grad norm: 5.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.348 | TFLOPs: 42.47 | +[default7]: iteration 5602/ 6200 | consumed samples: 5736448 | consumed tokens: 11748245504 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629651E+00 | loss scale: 2048.0 | grad norm: 5.618 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.188 | TFLOPs: 42.43 | +[default7]: iteration 5603/ 6200 | consumed samples: 5737472 | consumed tokens: 11750342656 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641974E+00 | loss scale: 2048.0 | grad norm: 5.825 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.239 | TFLOPs: 42.44 | +[default7]: iteration 5604/ 6200 | consumed samples: 5738496 | consumed tokens: 11752439808 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633404E+00 | loss scale: 2048.0 | grad norm: 5.912 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.079 | TFLOPs: 42.39 | +[default7]: iteration 5605/ 6200 | consumed samples: 5739520 | consumed tokens: 11754536960 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632143E+00 | loss scale: 2048.0 | grad norm: 4.884 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.157 | TFLOPs: 42.42 | +[default7]: iteration 5606/ 6200 | consumed samples: 5740544 | consumed tokens: 11756634112 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637068E+00 | loss scale: 2048.0 | grad norm: 5.958 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.294 | TFLOPs: 42.46 | +[default7]: iteration 5607/ 6200 | consumed samples: 5741568 | consumed tokens: 11758731264 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635822E+00 | loss scale: 2048.0 | grad norm: 5.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.198 | TFLOPs: 42.43 | +[default7]: iteration 5608/ 6200 | consumed samples: 5742592 | consumed tokens: 11760828416 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619335E+00 | loss scale: 2048.0 | grad norm: 6.067 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.989 | TFLOPs: 42.36 | +[default7]: iteration 5609/ 6200 | consumed samples: 5743616 | consumed tokens: 11762925568 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641615E+00 | loss scale: 2048.0 | grad norm: 5.761 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.154 | TFLOPs: 42.42 | +[default7]: iteration 5610/ 6200 | consumed samples: 5744640 | consumed tokens: 11765022720 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617815E+00 | loss scale: 2048.0 | grad norm: 5.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.059 | TFLOPs: 42.39 | +[default7]: iteration 5611/ 6200 | consumed samples: 5745664 | consumed tokens: 11767119872 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626487E+00 | loss scale: 2048.0 | grad norm: 6.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.768 | TFLOPs: 42.30 | +[default7]: iteration 5612/ 6200 | consumed samples: 5746688 | consumed tokens: 11769217024 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.613276E+00 | loss scale: 2048.0 | grad norm: 5.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.983 | TFLOPs: 42.36 | +[default7]: iteration 5613/ 6200 | consumed samples: 5747712 | consumed tokens: 11771314176 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635994E+00 | loss scale: 2048.0 | grad norm: 5.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.987 | TFLOPs: 42.36 | +[default7]: iteration 5614/ 6200 | consumed samples: 5748736 | consumed tokens: 11773411328 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632390E+00 | loss scale: 2048.0 | grad norm: 5.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.814 | TFLOPs: 42.31 | +[default7]: iteration 5615/ 6200 | consumed samples: 5749760 | consumed tokens: 11775508480 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644280E+00 | loss scale: 2048.0 | grad norm: 5.646 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.826 | TFLOPs: 42.32 | +[default7]: iteration 5616/ 6200 | consumed samples: 5750784 | consumed tokens: 11777605632 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656593E+00 | loss scale: 2048.0 | grad norm: 5.804 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.932 | TFLOPs: 42.35 | +[default7]: iteration 5617/ 6200 | consumed samples: 5751808 | consumed tokens: 11779702784 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645750E+00 | loss scale: 2048.0 | grad norm: 5.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.977 | TFLOPs: 42.36 | +[default7]: iteration 5618/ 6200 | consumed samples: 5752832 | consumed tokens: 11781799936 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636041E+00 | loss scale: 2048.0 | grad norm: 5.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.838 | TFLOPs: 42.32 | +[default7]: iteration 5619/ 6200 | consumed samples: 5753856 | consumed tokens: 11783897088 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650742E+00 | loss scale: 2048.0 | grad norm: 6.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.056 | TFLOPs: 42.39 | +[default7]: iteration 5620/ 6200 | consumed samples: 5754880 | consumed tokens: 11785994240 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644988E+00 | loss scale: 2048.0 | grad norm: 4.849 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.669 | TFLOPs: 42.27 | +[default7]: iteration 5621/ 6200 | consumed samples: 5755904 | consumed tokens: 11788091392 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642025E+00 | loss scale: 2048.0 | grad norm: 5.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.837 | TFLOPs: 42.32 | +[default7]: iteration 5622/ 6200 | consumed samples: 5756928 | consumed tokens: 11790188544 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624654E+00 | loss scale: 2048.0 | grad norm: 5.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.899 | TFLOPs: 42.34 | +[default7]: iteration 5623/ 6200 | consumed samples: 5757952 | consumed tokens: 11792285696 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624092E+00 | loss scale: 2048.0 | grad norm: 5.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.037 | TFLOPs: 42.38 | +[default7]: iteration 5624/ 6200 | consumed samples: 5758976 | consumed tokens: 11794382848 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625131E+00 | loss scale: 2048.0 | grad norm: 7.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.810 | TFLOPs: 42.31 | +[default7]: iteration 5625/ 6200 | consumed samples: 5760000 | consumed tokens: 11796480000 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631062E+00 | loss scale: 2048.0 | grad norm: 6.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.659 | TFLOPs: 42.26 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 5625 | lm loss value: 3.695323E+00 | lm loss PPL: 4.025855E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 5625 | lm loss value: 1.530529E+00 | lm loss PPL: 4.620620E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 5626/ 6200 | consumed samples: 5761024 | consumed tokens: 11798577152 | elapsed time per iteration (s): 51.75 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639713E+00 | loss scale: 2048.0 | grad norm: 5.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.787 | TFLOPs: 6.03 | +[default7]: iteration 5627/ 6200 | consumed samples: 5762048 | consumed tokens: 11800674304 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647593E+00 | loss scale: 2048.0 | grad norm: 6.903 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.894 | TFLOPs: 42.34 | +[default7]: iteration 5628/ 6200 | consumed samples: 5763072 | consumed tokens: 11802771456 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627362E+00 | loss scale: 2048.0 | grad norm: 5.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.642 | TFLOPs: 42.26 | +[default7]: iteration 5629/ 6200 | consumed samples: 5764096 | consumed tokens: 11804868608 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635386E+00 | loss scale: 2048.0 | grad norm: 5.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.213 | TFLOPs: 42.43 | +[default7]: iteration 5630/ 6200 | consumed samples: 5765120 | consumed tokens: 11806965760 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610011E+00 | loss scale: 2048.0 | grad norm: 5.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.962 | TFLOPs: 42.36 | +[default7]: iteration 5631/ 6200 | consumed samples: 5766144 | consumed tokens: 11809062912 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628070E+00 | loss scale: 2048.0 | grad norm: 5.684 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.100 | TFLOPs: 42.40 | +[default7]: iteration 5632/ 6200 | consumed samples: 5767168 | consumed tokens: 11811160064 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637243E+00 | loss scale: 2048.0 | grad norm: 5.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.988 | TFLOPs: 42.36 | +[default7]: iteration 5633/ 6200 | consumed samples: 5768192 | consumed tokens: 11813257216 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632816E+00 | loss scale: 2048.0 | grad norm: 5.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.223 | TFLOPs: 42.44 | +[default7]: iteration 5634/ 6200 | consumed samples: 5769216 | consumed tokens: 11815354368 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630900E+00 | loss scale: 2048.0 | grad norm: 4.938 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.252 | TFLOPs: 42.45 | +[default7]: iteration 5635/ 6200 | consumed samples: 5770240 | consumed tokens: 11817451520 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621442E+00 | loss scale: 2048.0 | grad norm: 5.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.293 | TFLOPs: 42.46 | +[default7]: iteration 5636/ 6200 | consumed samples: 5771264 | consumed tokens: 11819548672 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646710E+00 | loss scale: 2048.0 | grad norm: 5.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.212 | TFLOPs: 42.43 | +[default7]: iteration 5637/ 6200 | consumed samples: 5772288 | consumed tokens: 11821645824 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631750E+00 | loss scale: 2048.0 | grad norm: 4.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.095 | TFLOPs: 42.40 | +[default7]: iteration 5638/ 6200 | consumed samples: 5773312 | consumed tokens: 11823742976 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614728E+00 | loss scale: 2048.0 | grad norm: 4.965 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.865 | TFLOPs: 42.33 | +[default7]: iteration 5639/ 6200 | consumed samples: 5774336 | consumed tokens: 11825840128 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627518E+00 | loss scale: 2048.0 | grad norm: 4.795 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.103 | TFLOPs: 42.40 | +[default7]: iteration 5640/ 6200 | consumed samples: 5775360 | consumed tokens: 11827937280 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615476E+00 | loss scale: 2048.0 | grad norm: 4.804 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.002 | TFLOPs: 42.37 | +[default7]: iteration 5641/ 6200 | consumed samples: 5776384 | consumed tokens: 11830034432 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.661148E+00 | loss scale: 2048.0 | grad norm: 5.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.134 | TFLOPs: 42.41 | +[default7]: iteration 5642/ 6200 | consumed samples: 5777408 | consumed tokens: 11832131584 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637880E+00 | loss scale: 2048.0 | grad norm: 5.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.993 | TFLOPs: 42.37 | +[default7]: iteration 5643/ 6200 | consumed samples: 5778432 | consumed tokens: 11834228736 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617784E+00 | loss scale: 2048.0 | grad norm: 5.020 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.676 | TFLOPs: 42.27 | +[default7]: iteration 5644/ 6200 | consumed samples: 5779456 | consumed tokens: 11836325888 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610022E+00 | loss scale: 2048.0 | grad norm: 6.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.131 | TFLOPs: 42.41 | +[default7]: iteration 5645/ 6200 | consumed samples: 5780480 | consumed tokens: 11838423040 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653883E+00 | loss scale: 2048.0 | grad norm: 7.701 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.025 | TFLOPs: 42.38 | +[default7]: iteration 5646/ 6200 | consumed samples: 5781504 | consumed tokens: 11840520192 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.602794E+00 | loss scale: 2048.0 | grad norm: 5.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.828 | TFLOPs: 42.32 | +[default7]: iteration 5647/ 6200 | consumed samples: 5782528 | consumed tokens: 11842617344 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618115E+00 | loss scale: 2048.0 | grad norm: 4.640 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.170 | TFLOPs: 42.42 | +[default7]: iteration 5648/ 6200 | consumed samples: 5783552 | consumed tokens: 11844714496 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638674E+00 | loss scale: 2048.0 | grad norm: 7.593 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.227 | TFLOPs: 42.44 | +[default7]: iteration 5649/ 6200 | consumed samples: 5784576 | consumed tokens: 11846811648 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656766E+00 | loss scale: 2048.0 | grad norm: 5.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.021 | TFLOPs: 42.37 | +[default7]: iteration 5650/ 6200 | consumed samples: 5785600 | consumed tokens: 11848908800 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636024E+00 | loss scale: 2048.0 | grad norm: 5.828 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.692 | TFLOPs: 42.27 | +[default7]: iteration 5651/ 6200 | consumed samples: 5786624 | consumed tokens: 11851005952 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654260E+00 | loss scale: 2048.0 | grad norm: 5.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.999 | TFLOPs: 42.37 | +[default7]: iteration 5652/ 6200 | consumed samples: 5787648 | consumed tokens: 11853103104 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609796E+00 | loss scale: 2048.0 | grad norm: 6.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.004 | TFLOPs: 42.37 | +[default7]: iteration 5653/ 6200 | consumed samples: 5788672 | consumed tokens: 11855200256 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618316E+00 | loss scale: 2048.0 | grad norm: 4.740 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.817 | TFLOPs: 42.31 | +[default7]: iteration 5654/ 6200 | consumed samples: 5789696 | consumed tokens: 11857297408 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617389E+00 | loss scale: 2048.0 | grad norm: 6.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.842 | TFLOPs: 42.32 | +[default7]: iteration 5655/ 6200 | consumed samples: 5790720 | consumed tokens: 11859394560 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621503E+00 | loss scale: 2048.0 | grad norm: 6.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.960 | TFLOPs: 42.36 | +[default7]: iteration 5656/ 6200 | consumed samples: 5791744 | consumed tokens: 11861491712 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632583E+00 | loss scale: 2048.0 | grad norm: 6.074 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.715 | TFLOPs: 42.28 | +[default7]: iteration 5657/ 6200 | consumed samples: 5792768 | consumed tokens: 11863588864 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.596980E+00 | loss scale: 2048.0 | grad norm: 5.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.847 | TFLOPs: 42.32 | +[default7]: iteration 5658/ 6200 | consumed samples: 5793792 | consumed tokens: 11865686016 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632668E+00 | loss scale: 2048.0 | grad norm: 5.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.742 | TFLOPs: 42.29 | +[default7]: iteration 5659/ 6200 | consumed samples: 5794816 | consumed tokens: 11867783168 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621437E+00 | loss scale: 2048.0 | grad norm: 5.565 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.036 | TFLOPs: 42.38 | +[default7]: iteration 5660/ 6200 | consumed samples: 5795840 | consumed tokens: 11869880320 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626300E+00 | loss scale: 2048.0 | grad norm: 7.004 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.776 | TFLOPs: 42.30 | +[default7]: iteration 5661/ 6200 | consumed samples: 5796864 | consumed tokens: 11871977472 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614041E+00 | loss scale: 2048.0 | grad norm: 4.830 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.749 | TFLOPs: 42.29 | +[default7]: iteration 5662/ 6200 | consumed samples: 5797888 | consumed tokens: 11874074624 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609638E+00 | loss scale: 2048.0 | grad norm: 5.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.831 | TFLOPs: 42.32 | +[default7]: iteration 5663/ 6200 | consumed samples: 5798912 | consumed tokens: 11876171776 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652415E+00 | loss scale: 2048.0 | grad norm: 5.614 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.847 | TFLOPs: 42.32 | +[default7]: iteration 5664/ 6200 | consumed samples: 5799936 | consumed tokens: 11878268928 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622451E+00 | loss scale: 2048.0 | grad norm: 4.555 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.051 | TFLOPs: 42.38 | +[default7]: iteration 5665/ 6200 | consumed samples: 5800960 | consumed tokens: 11880366080 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617952E+00 | loss scale: 2048.0 | grad norm: 5.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.056 | TFLOPs: 42.39 | +[default7]: iteration 5666/ 6200 | consumed samples: 5801984 | consumed tokens: 11882463232 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641883E+00 | loss scale: 2048.0 | grad norm: 4.897 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.968 | TFLOPs: 42.36 | +[default7]: iteration 5667/ 6200 | consumed samples: 5803008 | consumed tokens: 11884560384 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633630E+00 | loss scale: 2048.0 | grad norm: 5.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.129 | TFLOPs: 42.41 | +[default7]: iteration 5668/ 6200 | consumed samples: 5804032 | consumed tokens: 11886657536 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621254E+00 | loss scale: 2048.0 | grad norm: 5.043 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.913 | TFLOPs: 42.34 | +[default7]: iteration 5669/ 6200 | consumed samples: 5805056 | consumed tokens: 11888754688 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624049E+00 | loss scale: 2048.0 | grad norm: 5.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.995 | TFLOPs: 42.37 | +[default7]: iteration 5670/ 6200 | consumed samples: 5806080 | consumed tokens: 11890851840 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618842E+00 | loss scale: 2048.0 | grad norm: 4.944 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.946 | TFLOPs: 42.35 | +[default7]: iteration 5671/ 6200 | consumed samples: 5807104 | consumed tokens: 11892948992 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625852E+00 | loss scale: 2048.0 | grad norm: 5.781 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.108 | TFLOPs: 42.40 | +[default7]: iteration 5672/ 6200 | consumed samples: 5808128 | consumed tokens: 11895046144 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625197E+00 | loss scale: 2048.0 | grad norm: 4.999 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.109 | TFLOPs: 42.40 | +[default7]: iteration 5673/ 6200 | consumed samples: 5809152 | consumed tokens: 11897143296 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622627E+00 | loss scale: 2048.0 | grad norm: 5.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.088 | TFLOPs: 42.40 | +[default7]: iteration 5674/ 6200 | consumed samples: 5810176 | consumed tokens: 11899240448 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615133E+00 | loss scale: 2048.0 | grad norm: 5.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.032 | TFLOPs: 42.38 | +[default7]: iteration 5675/ 6200 | consumed samples: 5811200 | consumed tokens: 11901337600 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609957E+00 | loss scale: 2048.0 | grad norm: 5.696 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.011 | TFLOPs: 42.37 | +[default7]: iteration 5676/ 6200 | consumed samples: 5812224 | consumed tokens: 11903434752 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640611E+00 | loss scale: 2048.0 | grad norm: 6.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.033 | TFLOPs: 42.38 | +[default7]: iteration 5677/ 6200 | consumed samples: 5813248 | consumed tokens: 11905531904 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637452E+00 | loss scale: 2048.0 | grad norm: 6.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.987 | TFLOPs: 42.36 | +[default7]: iteration 5678/ 6200 | consumed samples: 5814272 | consumed tokens: 11907629056 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626117E+00 | loss scale: 2048.0 | grad norm: 5.657 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.990 | TFLOPs: 42.37 | +[default7]: iteration 5679/ 6200 | consumed samples: 5815296 | consumed tokens: 11909726208 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621980E+00 | loss scale: 2048.0 | grad norm: 4.993 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.878 | TFLOPs: 42.33 | +[default7]: iteration 5680/ 6200 | consumed samples: 5816320 | consumed tokens: 11911823360 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654590E+00 | loss scale: 2048.0 | grad norm: 5.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.654 | TFLOPs: 42.26 | +[default7]: iteration 5681/ 6200 | consumed samples: 5817344 | consumed tokens: 11913920512 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623060E+00 | loss scale: 2048.0 | grad norm: 5.079 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.193 | TFLOPs: 42.43 | +[default7]: iteration 5682/ 6200 | consumed samples: 5818368 | consumed tokens: 11916017664 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621263E+00 | loss scale: 2048.0 | grad norm: 6.896 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.714 | TFLOPs: 41.98 | +[default7]: iteration 5683/ 6200 | consumed samples: 5819392 | consumed tokens: 11918114816 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638827E+00 | loss scale: 2048.0 | grad norm: 6.952 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.151 | TFLOPs: 42.41 | +[default7]: iteration 5684/ 6200 | consumed samples: 5820416 | consumed tokens: 11920211968 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614215E+00 | loss scale: 2048.0 | grad norm: 5.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.969 | TFLOPs: 42.36 | +[default7]: iteration 5685/ 6200 | consumed samples: 5821440 | consumed tokens: 11922309120 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653204E+00 | loss scale: 2048.0 | grad norm: 5.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.120 | TFLOPs: 42.40 | +[default7]: iteration 5686/ 6200 | consumed samples: 5822464 | consumed tokens: 11924406272 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624380E+00 | loss scale: 2048.0 | grad norm: 5.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.972 | TFLOPs: 42.36 | +[default7]: iteration 5687/ 6200 | consumed samples: 5823488 | consumed tokens: 11926503424 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623007E+00 | loss scale: 2048.0 | grad norm: 5.663 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.184 | TFLOPs: 42.42 | +[default7]: iteration 5688/ 6200 | consumed samples: 5824512 | consumed tokens: 11928600576 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624329E+00 | loss scale: 2048.0 | grad norm: 4.935 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.103 | TFLOPs: 42.40 | +[default7]: iteration 5689/ 6200 | consumed samples: 5825536 | consumed tokens: 11930697728 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646898E+00 | loss scale: 2048.0 | grad norm: 5.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.296 | TFLOPs: 42.46 | +[default7]: iteration 5690/ 6200 | consumed samples: 5826560 | consumed tokens: 11932794880 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646987E+00 | loss scale: 2048.0 | grad norm: 5.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.022 | TFLOPs: 42.38 | +[default7]: iteration 5691/ 6200 | consumed samples: 5827584 | consumed tokens: 11934892032 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622835E+00 | loss scale: 2048.0 | grad norm: 4.894 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.035 | TFLOPs: 42.38 | +[default7]: iteration 5692/ 6200 | consumed samples: 5828608 | consumed tokens: 11936989184 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618844E+00 | loss scale: 2048.0 | grad norm: 5.952 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.930 | TFLOPs: 42.35 | +[default7]: iteration 5693/ 6200 | consumed samples: 5829632 | consumed tokens: 11939086336 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627203E+00 | loss scale: 2048.0 | grad norm: 5.027 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.004 | TFLOPs: 42.37 | +[default7]: iteration 5694/ 6200 | consumed samples: 5830656 | consumed tokens: 11941183488 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624802E+00 | loss scale: 2048.0 | grad norm: 5.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.870 | TFLOPs: 42.33 | +[default7]: iteration 5695/ 6200 | consumed samples: 5831680 | consumed tokens: 11943280640 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635312E+00 | loss scale: 2048.0 | grad norm: 4.889 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.151 | TFLOPs: 42.41 | +[default7]: iteration 5696/ 6200 | consumed samples: 5832704 | consumed tokens: 11945377792 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624829E+00 | loss scale: 2048.0 | grad norm: 5.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.060 | TFLOPs: 42.39 | +[default7]: iteration 5697/ 6200 | consumed samples: 5833728 | consumed tokens: 11947474944 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633854E+00 | loss scale: 2048.0 | grad norm: 6.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.162 | TFLOPs: 42.42 | +[default7]: iteration 5698/ 6200 | consumed samples: 5834752 | consumed tokens: 11949572096 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617633E+00 | loss scale: 2048.0 | grad norm: 4.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.057 | TFLOPs: 42.39 | +[default7]: iteration 5699/ 6200 | consumed samples: 5835776 | consumed tokens: 11951669248 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.598191E+00 | loss scale: 2048.0 | grad norm: 5.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.081 | TFLOPs: 42.39 | +[default7]: iteration 5700/ 6200 | consumed samples: 5836800 | consumed tokens: 11953766400 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637897E+00 | loss scale: 2048.0 | grad norm: 6.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.933 | TFLOPs: 42.35 | +[default7]: iteration 5701/ 6200 | consumed samples: 5837824 | consumed tokens: 11955863552 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633969E+00 | loss scale: 2048.0 | grad norm: 5.559 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.136 | TFLOPs: 42.41 | +[default7]: iteration 5702/ 6200 | consumed samples: 5838848 | consumed tokens: 11957960704 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628236E+00 | loss scale: 2048.0 | grad norm: 5.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.152 | TFLOPs: 42.41 | +[default7]: iteration 5703/ 6200 | consumed samples: 5839872 | consumed tokens: 11960057856 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612799E+00 | loss scale: 2048.0 | grad norm: 6.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.073 | TFLOPs: 42.39 | +[default7]: iteration 5704/ 6200 | consumed samples: 5840896 | consumed tokens: 11962155008 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639212E+00 | loss scale: 2048.0 | grad norm: 5.573 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.024 | TFLOPs: 42.38 | +[default7]: iteration 5705/ 6200 | consumed samples: 5841920 | consumed tokens: 11964252160 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617360E+00 | loss scale: 2048.0 | grad norm: 5.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.983 | TFLOPs: 42.36 | +[default7]: iteration 5706/ 6200 | consumed samples: 5842944 | consumed tokens: 11966349312 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618081E+00 | loss scale: 2048.0 | grad norm: 5.010 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.766 | TFLOPs: 42.30 | +[default7]: iteration 5707/ 6200 | consumed samples: 5843968 | consumed tokens: 11968446464 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622155E+00 | loss scale: 2048.0 | grad norm: 7.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.014 | TFLOPs: 42.37 | +[default7]: iteration 5708/ 6200 | consumed samples: 5844992 | consumed tokens: 11970543616 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629567E+00 | loss scale: 2048.0 | grad norm: 5.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.080 | TFLOPs: 42.39 | +[default7]: iteration 5709/ 6200 | consumed samples: 5846016 | consumed tokens: 11972640768 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632424E+00 | loss scale: 2048.0 | grad norm: 4.808 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.984 | TFLOPs: 42.36 | +[default7]: iteration 5710/ 6200 | consumed samples: 5847040 | consumed tokens: 11974737920 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621844E+00 | loss scale: 2048.0 | grad norm: 5.907 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.021 | TFLOPs: 42.37 | +[default7]: iteration 5711/ 6200 | consumed samples: 5848064 | consumed tokens: 11976835072 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640966E+00 | loss scale: 2048.0 | grad norm: 5.753 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.041 | TFLOPs: 42.38 | +[default7]: iteration 5712/ 6200 | consumed samples: 5849088 | consumed tokens: 11978932224 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.594825E+00 | loss scale: 2048.0 | grad norm: 5.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.821 | TFLOPs: 42.31 | +[default7]: iteration 5713/ 6200 | consumed samples: 5850112 | consumed tokens: 11981029376 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630147E+00 | loss scale: 2048.0 | grad norm: 4.670 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.912 | TFLOPs: 42.34 | +[default7]: iteration 5714/ 6200 | consumed samples: 5851136 | consumed tokens: 11983126528 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.606397E+00 | loss scale: 2048.0 | grad norm: 5.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.091 | TFLOPs: 42.40 | +[default7]: iteration 5715/ 6200 | consumed samples: 5852160 | consumed tokens: 11985223680 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627369E+00 | loss scale: 2048.0 | grad norm: 5.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.141 | TFLOPs: 42.41 | +[default7]: iteration 5716/ 6200 | consumed samples: 5853184 | consumed tokens: 11987320832 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635703E+00 | loss scale: 2048.0 | grad norm: 4.702 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.080 | TFLOPs: 42.39 | +[default7]: iteration 5717/ 6200 | consumed samples: 5854208 | consumed tokens: 11989417984 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.620014E+00 | loss scale: 2048.0 | grad norm: 5.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.002 | TFLOPs: 42.37 | +[default7]: iteration 5718/ 6200 | consumed samples: 5855232 | consumed tokens: 11991515136 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.599299E+00 | loss scale: 2048.0 | grad norm: 4.801 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.369 | TFLOPs: 42.18 | +[default7]: iteration 5719/ 6200 | consumed samples: 5856256 | consumed tokens: 11993612288 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619204E+00 | loss scale: 2048.0 | grad norm: 5.795 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.807 | TFLOPs: 42.31 | +[default7]: iteration 5720/ 6200 | consumed samples: 5857280 | consumed tokens: 11995709440 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631248E+00 | loss scale: 2048.0 | grad norm: 5.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.076 | TFLOPs: 42.39 | +[default7]: iteration 5721/ 6200 | consumed samples: 5858304 | consumed tokens: 11997806592 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633610E+00 | loss scale: 2048.0 | grad norm: 5.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.156 | TFLOPs: 42.42 | +[default7]: iteration 5722/ 6200 | consumed samples: 5859328 | consumed tokens: 11999903744 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645218E+00 | loss scale: 2048.0 | grad norm: 4.988 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.005 | TFLOPs: 42.37 | +[default7]: iteration 5723/ 6200 | consumed samples: 5860352 | consumed tokens: 12002000896 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.658884E+00 | loss scale: 2048.0 | grad norm: 5.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.883 | TFLOPs: 42.33 | +[default7]: iteration 5724/ 6200 | consumed samples: 5861376 | consumed tokens: 12004098048 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612169E+00 | loss scale: 2048.0 | grad norm: 5.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.006 | TFLOPs: 42.37 | +[default7]: iteration 5725/ 6200 | consumed samples: 5862400 | consumed tokens: 12006195200 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.605983E+00 | loss scale: 2048.0 | grad norm: 5.786 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.033 | TFLOPs: 42.38 | +[default7]: iteration 5726/ 6200 | consumed samples: 5863424 | consumed tokens: 12008292352 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647870E+00 | loss scale: 2048.0 | grad norm: 4.912 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.933 | TFLOPs: 42.35 | +[default7]: iteration 5727/ 6200 | consumed samples: 5864448 | consumed tokens: 12010389504 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623654E+00 | loss scale: 2048.0 | grad norm: 5.638 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.053 | TFLOPs: 42.38 | +[default7]: iteration 5728/ 6200 | consumed samples: 5865472 | consumed tokens: 12012486656 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645959E+00 | loss scale: 2048.0 | grad norm: 5.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.049 | TFLOPs: 42.38 | +[default7]: iteration 5729/ 6200 | consumed samples: 5866496 | consumed tokens: 12014583808 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629478E+00 | loss scale: 2048.0 | grad norm: 7.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.031 | TFLOPs: 42.38 | +[default7]: iteration 5730/ 6200 | consumed samples: 5867520 | consumed tokens: 12016680960 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634666E+00 | loss scale: 2048.0 | grad norm: 6.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.012 | TFLOPs: 42.37 | +[default7]: iteration 5731/ 6200 | consumed samples: 5868544 | consumed tokens: 12018778112 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646133E+00 | loss scale: 2048.0 | grad norm: 5.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.833 | TFLOPs: 42.32 | +[default7]: iteration 5732/ 6200 | consumed samples: 5869568 | consumed tokens: 12020875264 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653486E+00 | loss scale: 2048.0 | grad norm: 4.705 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.958 | TFLOPs: 42.36 | +[default7]: iteration 5733/ 6200 | consumed samples: 5870592 | consumed tokens: 12022972416 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617947E+00 | loss scale: 2048.0 | grad norm: 5.015 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.005 | TFLOPs: 42.37 | +[default7]: iteration 5734/ 6200 | consumed samples: 5871616 | consumed tokens: 12025069568 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646972E+00 | loss scale: 2048.0 | grad norm: 5.621 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.055 | TFLOPs: 42.39 | +[default7]: iteration 5735/ 6200 | consumed samples: 5872640 | consumed tokens: 12027166720 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630389E+00 | loss scale: 2048.0 | grad norm: 5.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.175 | TFLOPs: 42.42 | +[default7]: iteration 5736/ 6200 | consumed samples: 5873664 | consumed tokens: 12029263872 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638384E+00 | loss scale: 2048.0 | grad norm: 4.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.684 | TFLOPs: 42.27 | +[default7]: iteration 5737/ 6200 | consumed samples: 5874688 | consumed tokens: 12031361024 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614980E+00 | loss scale: 2048.0 | grad norm: 5.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.860 | TFLOPs: 42.33 | +[default7]: iteration 5738/ 6200 | consumed samples: 5875712 | consumed tokens: 12033458176 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.599679E+00 | loss scale: 2048.0 | grad norm: 5.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.591 | TFLOPs: 42.24 | +[default7]: iteration 5739/ 6200 | consumed samples: 5876736 | consumed tokens: 12035555328 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627072E+00 | loss scale: 2048.0 | grad norm: 5.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.659 | TFLOPs: 42.26 | +[default7]: iteration 5740/ 6200 | consumed samples: 5877760 | consumed tokens: 12037652480 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649836E+00 | loss scale: 2048.0 | grad norm: 5.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.135 | TFLOPs: 42.41 | +[default7]: iteration 5741/ 6200 | consumed samples: 5878784 | consumed tokens: 12039749632 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614793E+00 | loss scale: 2048.0 | grad norm: 5.023 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.128 | TFLOPs: 42.41 | +[default7]: iteration 5742/ 6200 | consumed samples: 5879808 | consumed tokens: 12041846784 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649998E+00 | loss scale: 2048.0 | grad norm: 6.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.789 | TFLOPs: 42.30 | +[default7]: iteration 5743/ 6200 | consumed samples: 5880832 | consumed tokens: 12043943936 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612356E+00 | loss scale: 2048.0 | grad norm: 5.009 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.134 | TFLOPs: 42.41 | +[default7]: iteration 5744/ 6200 | consumed samples: 5881856 | consumed tokens: 12046041088 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618208E+00 | loss scale: 2048.0 | grad norm: 5.069 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.051 | TFLOPs: 42.38 | +[default7]: iteration 5745/ 6200 | consumed samples: 5882880 | consumed tokens: 12048138240 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642666E+00 | loss scale: 2048.0 | grad norm: 5.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.050 | TFLOPs: 42.38 | +[default7]: iteration 5746/ 6200 | consumed samples: 5883904 | consumed tokens: 12050235392 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.603004E+00 | loss scale: 2048.0 | grad norm: 5.528 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.139 | TFLOPs: 42.41 | +[default7]: iteration 5747/ 6200 | consumed samples: 5884928 | consumed tokens: 12052332544 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610945E+00 | loss scale: 2048.0 | grad norm: 4.768 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.121 | TFLOPs: 42.41 | +[default7]: iteration 5748/ 6200 | consumed samples: 5885952 | consumed tokens: 12054429696 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.592688E+00 | loss scale: 2048.0 | grad norm: 4.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.857 | TFLOPs: 42.32 | +[default7]: iteration 5749/ 6200 | consumed samples: 5886976 | consumed tokens: 12056526848 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632984E+00 | loss scale: 2048.0 | grad norm: 6.710 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.095 | TFLOPs: 42.40 | +[default7]: iteration 5750/ 6200 | consumed samples: 5888000 | consumed tokens: 12058624000 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.603756E+00 | loss scale: 2048.0 | grad norm: 5.802 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.124 | TFLOPs: 42.41 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 5750 | lm loss value: 3.671294E+00 | lm loss PPL: 3.930273E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 5750 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-07 01:05:53,534] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5750 is begin to save! +[default0]:[2022-10-07 01:05:53,540] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_01-model_00-model_states.pt... +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 5750 | lm loss value: 1.517972E+00 | lm loss PPL: 4.562963E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:[2022-10-07 01:05:53,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:53,907] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:53,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:53,938] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:53,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:53,966] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:53,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:53,993] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,020] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,048] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,076] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,104] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,132] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,245] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,273] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,300] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,355] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,471] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,498] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,525] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 01:05:54,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,579] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_28-model_00-model_states.pt... +[default1]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 01:05:54,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 01:05:54,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 01:05:54,581] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/mp_rank_00_model_states.pt +[default0]:[2022-10-07 01:05:54,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 01:05:54,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 01:05:54,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 01:05:54,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 01:05:54,807] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 01:05:54,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 01:05:54,833] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 01:05:54,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 01:05:54,861] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 01:05:54,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 01:05:54,877] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 01:05:54,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 01:05:54,844] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 01:05:54,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 01:05:54,912] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 01:05:54,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 01:05:54,897] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 01:05:54,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 01:05:54,875] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 01:05:54,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 01:05:54,914] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 01:05:54,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 01:05:54,921] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 01:05:54,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 01:05:54,885] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 01:05:54,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 01:05:54,900] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 01:05:54,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 01:05:54,911] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 01:05:54,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 01:05:54,933] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 01:05:54,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 01:05:54,912] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 01:05:54,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 01:05:54,877] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 01:05:54,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 01:05:54,896] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 01:05:54,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 01:05:54,910] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 01:05:54,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 01:05:54,926] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 01:05:54,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 01:05:54,899] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 01:05:54,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 01:05:54,924] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 01:05:54,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 01:05:54,932] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 01:05:54,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 01:05:54,908] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 01:05:54,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 01:05:54,905] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 01:05:54,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 01:05:54,901] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 01:05:54,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 01:05:54,934] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 01:05:54,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 01:05:54,916] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 01:05:55,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 01:05:55,002] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default2]:[2022-10-07 01:05:54,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 01:05:54,958] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default1]:[2022-10-07 01:05:55,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 01:05:55,026] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default7]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default5]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default6]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default4]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default7]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default0]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default3]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default0]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default0]: successfully saved checkpoint at iteration 5750 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default4]:[2022-10-07 01:05:55,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 01:05:55,011] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default5]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default3]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default6]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default4]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default1]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default1]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default1]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default0]:[2022-10-07 01:05:55,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 01:05:55,033] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step5750/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default7]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default7]:time (ms) | save-checkpoint: 1500.90 +[default5]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default6]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default2]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default2]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default3]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default4]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default7]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default6]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default2]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default5]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default3]:[2022-10-07 01:05:55,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5750 is ready now! +[default7]: iteration 5751/ 6200 | consumed samples: 5889024 | consumed tokens: 12060721152 | elapsed time per iteration (s): 53.58 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632280E+00 | loss scale: 2048.0 | grad norm: 5.058 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.113 | TFLOPs: 5.83 | +[default7]: iteration 5752/ 6200 | consumed samples: 5890048 | consumed tokens: 12062818304 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623605E+00 | loss scale: 2048.0 | grad norm: 5.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.555 | TFLOPs: 42.23 | +[default7]: iteration 5753/ 6200 | consumed samples: 5891072 | consumed tokens: 12064915456 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615803E+00 | loss scale: 2048.0 | grad norm: 4.902 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.929 | TFLOPs: 42.35 | +[default7]: iteration 5754/ 6200 | consumed samples: 5892096 | consumed tokens: 12067012608 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610123E+00 | loss scale: 2048.0 | grad norm: 5.614 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.145 | TFLOPs: 42.41 | +[default7]: iteration 5755/ 6200 | consumed samples: 5893120 | consumed tokens: 12069109760 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632486E+00 | loss scale: 2048.0 | grad norm: 4.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.298 | TFLOPs: 42.46 | +[default7]: iteration 5756/ 6200 | consumed samples: 5894144 | consumed tokens: 12071206912 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.620871E+00 | loss scale: 2048.0 | grad norm: 4.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.025 | TFLOPs: 42.38 | +[default7]: iteration 5757/ 6200 | consumed samples: 5895168 | consumed tokens: 12073304064 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.647326E+00 | loss scale: 2048.0 | grad norm: 5.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.125 | TFLOPs: 42.41 | +[default7]: iteration 5758/ 6200 | consumed samples: 5896192 | consumed tokens: 12075401216 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.662836E+00 | loss scale: 2048.0 | grad norm: 5.643 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.105 | TFLOPs: 42.40 | +[default7]: iteration 5759/ 6200 | consumed samples: 5897216 | consumed tokens: 12077498368 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611744E+00 | loss scale: 2048.0 | grad norm: 4.658 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.012 | TFLOPs: 42.37 | +[default7]: iteration 5760/ 6200 | consumed samples: 5898240 | consumed tokens: 12079595520 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.584882E+00 | loss scale: 2048.0 | grad norm: 4.764 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.152 | TFLOPs: 42.41 | +[default7]: iteration 5761/ 6200 | consumed samples: 5899264 | consumed tokens: 12081692672 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642742E+00 | loss scale: 2048.0 | grad norm: 5.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.933 | TFLOPs: 42.35 | +[default7]: iteration 5762/ 6200 | consumed samples: 5900288 | consumed tokens: 12083789824 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632571E+00 | loss scale: 2048.0 | grad norm: 4.936 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.223 | TFLOPs: 42.44 | +[default7]: iteration 5763/ 6200 | consumed samples: 5901312 | consumed tokens: 12085886976 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638242E+00 | loss scale: 2048.0 | grad norm: 4.640 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.154 | TFLOPs: 42.42 | +[default7]: iteration 5764/ 6200 | consumed samples: 5902336 | consumed tokens: 12087984128 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641263E+00 | loss scale: 2048.0 | grad norm: 4.816 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.160 | TFLOPs: 42.42 | +[default7]: iteration 5765/ 6200 | consumed samples: 5903360 | consumed tokens: 12090081280 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611394E+00 | loss scale: 2048.0 | grad norm: 6.071 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.950 | TFLOPs: 42.35 | +[default7]: iteration 5766/ 6200 | consumed samples: 5904384 | consumed tokens: 12092178432 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.620061E+00 | loss scale: 2048.0 | grad norm: 6.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.244 | TFLOPs: 42.44 | +[default7]: iteration 5767/ 6200 | consumed samples: 5905408 | consumed tokens: 12094275584 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632974E+00 | loss scale: 2048.0 | grad norm: 4.523 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.172 | TFLOPs: 42.42 | +[default7]: iteration 5768/ 6200 | consumed samples: 5906432 | consumed tokens: 12096372736 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.668340E+00 | loss scale: 2048.0 | grad norm: 5.851 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.305 | TFLOPs: 42.46 | +[default7]: iteration 5769/ 6200 | consumed samples: 5907456 | consumed tokens: 12098469888 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629183E+00 | loss scale: 2048.0 | grad norm: 5.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.396 | TFLOPs: 42.18 | +[default7]: iteration 5770/ 6200 | consumed samples: 5908480 | consumed tokens: 12100567040 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612588E+00 | loss scale: 2048.0 | grad norm: 5.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.253 | TFLOPs: 42.45 | +[default7]: iteration 5771/ 6200 | consumed samples: 5909504 | consumed tokens: 12102664192 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623477E+00 | loss scale: 2048.0 | grad norm: 4.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.068 | TFLOPs: 42.39 | +[default7]: iteration 5772/ 6200 | consumed samples: 5910528 | consumed tokens: 12104761344 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638882E+00 | loss scale: 2048.0 | grad norm: 4.890 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.004 | TFLOPs: 42.37 | +[default7]: iteration 5773/ 6200 | consumed samples: 5911552 | consumed tokens: 12106858496 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649800E+00 | loss scale: 2048.0 | grad norm: 4.967 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.024 | TFLOPs: 42.38 | +[default7]: iteration 5774/ 6200 | consumed samples: 5912576 | consumed tokens: 12108955648 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.603393E+00 | loss scale: 2048.0 | grad norm: 4.977 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.975 | TFLOPs: 42.36 | +[default7]: iteration 5775/ 6200 | consumed samples: 5913600 | consumed tokens: 12111052800 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626707E+00 | loss scale: 2048.0 | grad norm: 4.954 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.953 | TFLOPs: 42.35 | +[default7]: iteration 5776/ 6200 | consumed samples: 5914624 | consumed tokens: 12113149952 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615001E+00 | loss scale: 2048.0 | grad norm: 5.963 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.107 | TFLOPs: 42.40 | +[default7]: iteration 5777/ 6200 | consumed samples: 5915648 | consumed tokens: 12115247104 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626634E+00 | loss scale: 2048.0 | grad norm: 6.064 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.030 | TFLOPs: 42.38 | +[default7]: iteration 5778/ 6200 | consumed samples: 5916672 | consumed tokens: 12117344256 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636762E+00 | loss scale: 2048.0 | grad norm: 5.687 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.182 | TFLOPs: 42.42 | +[default7]: iteration 5779/ 6200 | consumed samples: 5917696 | consumed tokens: 12119441408 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628603E+00 | loss scale: 2048.0 | grad norm: 4.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.087 | TFLOPs: 42.39 | +[default7]: iteration 5780/ 6200 | consumed samples: 5918720 | consumed tokens: 12121538560 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.607216E+00 | loss scale: 2048.0 | grad norm: 6.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.042 | TFLOPs: 42.38 | +[default7]: iteration 5781/ 6200 | consumed samples: 5919744 | consumed tokens: 12123635712 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622311E+00 | loss scale: 2048.0 | grad norm: 5.781 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.115 | TFLOPs: 42.40 | +[default7]: iteration 5782/ 6200 | consumed samples: 5920768 | consumed tokens: 12125732864 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611610E+00 | loss scale: 2048.0 | grad norm: 5.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.055 | TFLOPs: 42.39 | +[default7]: iteration 5783/ 6200 | consumed samples: 5921792 | consumed tokens: 12127830016 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638579E+00 | loss scale: 2048.0 | grad norm: 5.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.957 | TFLOPs: 42.36 | +[default7]: iteration 5784/ 6200 | consumed samples: 5922816 | consumed tokens: 12129927168 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621884E+00 | loss scale: 2048.0 | grad norm: 5.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.135 | TFLOPs: 42.41 | +[default7]: iteration 5785/ 6200 | consumed samples: 5923840 | consumed tokens: 12132024320 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619911E+00 | loss scale: 2048.0 | grad norm: 6.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.106 | TFLOPs: 42.40 | +[default7]: iteration 5786/ 6200 | consumed samples: 5924864 | consumed tokens: 12134121472 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626280E+00 | loss scale: 2048.0 | grad norm: 5.016 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.897 | TFLOPs: 42.34 | +[default7]: iteration 5787/ 6200 | consumed samples: 5925888 | consumed tokens: 12136218624 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630323E+00 | loss scale: 2048.0 | grad norm: 5.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.000 | TFLOPs: 42.37 | +[default7]: iteration 5788/ 6200 | consumed samples: 5926912 | consumed tokens: 12138315776 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609753E+00 | loss scale: 2048.0 | grad norm: 7.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.159 | TFLOPs: 42.42 | +[default7]: iteration 5789/ 6200 | consumed samples: 5927936 | consumed tokens: 12140412928 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629745E+00 | loss scale: 2048.0 | grad norm: 4.920 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.829 | TFLOPs: 42.32 | +[default7]: iteration 5790/ 6200 | consumed samples: 5928960 | consumed tokens: 12142510080 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629830E+00 | loss scale: 2048.0 | grad norm: 4.914 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.942 | TFLOPs: 42.35 | +[default7]: iteration 5791/ 6200 | consumed samples: 5929984 | consumed tokens: 12144607232 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610247E+00 | loss scale: 2048.0 | grad norm: 5.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.084 | TFLOPs: 42.39 | +[default7]: iteration 5792/ 6200 | consumed samples: 5931008 | consumed tokens: 12146704384 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.602850E+00 | loss scale: 2048.0 | grad norm: 5.022 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.053 | TFLOPs: 42.38 | +[default7]: iteration 5793/ 6200 | consumed samples: 5932032 | consumed tokens: 12148801536 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630446E+00 | loss scale: 2048.0 | grad norm: 5.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.060 | TFLOPs: 42.39 | +[default7]: iteration 5794/ 6200 | consumed samples: 5933056 | consumed tokens: 12150898688 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644596E+00 | loss scale: 2048.0 | grad norm: 4.945 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.126 | TFLOPs: 42.41 | +[default7]: iteration 5795/ 6200 | consumed samples: 5934080 | consumed tokens: 12152995840 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631545E+00 | loss scale: 2048.0 | grad norm: 4.734 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.012 | TFLOPs: 42.37 | +[default7]: iteration 5796/ 6200 | consumed samples: 5935104 | consumed tokens: 12155092992 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626702E+00 | loss scale: 2048.0 | grad norm: 5.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.048 | TFLOPs: 42.38 | +[default7]: iteration 5797/ 6200 | consumed samples: 5936128 | consumed tokens: 12157190144 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627364E+00 | loss scale: 2048.0 | grad norm: 5.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.056 | TFLOPs: 42.39 | +[default7]: iteration 5798/ 6200 | consumed samples: 5937152 | consumed tokens: 12159287296 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.606055E+00 | loss scale: 2048.0 | grad norm: 5.069 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.940 | TFLOPs: 42.35 | +[default7]: iteration 5799/ 6200 | consumed samples: 5938176 | consumed tokens: 12161384448 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618222E+00 | loss scale: 2048.0 | grad norm: 4.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.932 | TFLOPs: 42.35 | +[default7]: iteration 5800/ 6200 | consumed samples: 5939200 | consumed tokens: 12163481600 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612801E+00 | loss scale: 2048.0 | grad norm: 4.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.852 | TFLOPs: 42.32 | +[default7]: iteration 5801/ 6200 | consumed samples: 5940224 | consumed tokens: 12165578752 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634785E+00 | loss scale: 2048.0 | grad norm: 5.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.871 | TFLOPs: 42.33 | +[default7]: iteration 5802/ 6200 | consumed samples: 5941248 | consumed tokens: 12167675904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612082E+00 | loss scale: 2048.0 | grad norm: 6.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.761 | TFLOPs: 42.30 | +[default7]: iteration 5803/ 6200 | consumed samples: 5942272 | consumed tokens: 12169773056 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626545E+00 | loss scale: 2048.0 | grad norm: 5.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.078 | TFLOPs: 42.39 | +[default7]: iteration 5804/ 6200 | consumed samples: 5943296 | consumed tokens: 12171870208 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.616557E+00 | loss scale: 2048.0 | grad norm: 6.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.921 | TFLOPs: 42.34 | +[default7]: iteration 5805/ 6200 | consumed samples: 5944320 | consumed tokens: 12173967360 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628532E+00 | loss scale: 2048.0 | grad norm: 5.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.004 | TFLOPs: 42.37 | +[default7]: iteration 5806/ 6200 | consumed samples: 5945344 | consumed tokens: 12176064512 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614053E+00 | loss scale: 2048.0 | grad norm: 5.065 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.080 | TFLOPs: 42.39 | +[default7]: iteration 5807/ 6200 | consumed samples: 5946368 | consumed tokens: 12178161664 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.653111E+00 | loss scale: 2048.0 | grad norm: 6.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.930 | TFLOPs: 42.35 | +[default7]: iteration 5808/ 6200 | consumed samples: 5947392 | consumed tokens: 12180258816 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.616940E+00 | loss scale: 2048.0 | grad norm: 5.064 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.058 | TFLOPs: 42.39 | +[default7]: iteration 5809/ 6200 | consumed samples: 5948416 | consumed tokens: 12182355968 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610433E+00 | loss scale: 2048.0 | grad norm: 5.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.098 | TFLOPs: 42.40 | +[default7]: iteration 5810/ 6200 | consumed samples: 5949440 | consumed tokens: 12184453120 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618615E+00 | loss scale: 2048.0 | grad norm: 4.924 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.141 | TFLOPs: 42.41 | +[default7]: iteration 5811/ 6200 | consumed samples: 5950464 | consumed tokens: 12186550272 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.660259E+00 | loss scale: 2048.0 | grad norm: 5.754 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.144 | TFLOPs: 42.41 | +[default7]: iteration 5812/ 6200 | consumed samples: 5951488 | consumed tokens: 12188647424 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631755E+00 | loss scale: 2048.0 | grad norm: 5.826 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.003 | TFLOPs: 42.37 | +[default7]: iteration 5813/ 6200 | consumed samples: 5952512 | consumed tokens: 12190744576 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632376E+00 | loss scale: 2048.0 | grad norm: 4.735 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.810 | TFLOPs: 42.31 | +[default7]: iteration 5814/ 6200 | consumed samples: 5953536 | consumed tokens: 12192841728 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619625E+00 | loss scale: 2048.0 | grad norm: 4.994 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.058 | TFLOPs: 42.39 | +[default7]: iteration 5815/ 6200 | consumed samples: 5954560 | consumed tokens: 12194938880 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643188E+00 | loss scale: 2048.0 | grad norm: 5.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.037 | TFLOPs: 42.38 | +[default7]: iteration 5816/ 6200 | consumed samples: 5955584 | consumed tokens: 12197036032 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612982E+00 | loss scale: 2048.0 | grad norm: 4.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.117 | TFLOPs: 42.40 | +[default7]: iteration 5817/ 6200 | consumed samples: 5956608 | consumed tokens: 12199133184 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629532E+00 | loss scale: 2048.0 | grad norm: 5.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.101 | TFLOPs: 42.40 | +[default7]: iteration 5818/ 6200 | consumed samples: 5957632 | consumed tokens: 12201230336 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612555E+00 | loss scale: 2048.0 | grad norm: 5.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.005 | TFLOPs: 42.37 | +[default7]: iteration 5819/ 6200 | consumed samples: 5958656 | consumed tokens: 12203327488 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628686E+00 | loss scale: 2048.0 | grad norm: 6.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.736 | TFLOPs: 42.29 | +[default7]: iteration 5820/ 6200 | consumed samples: 5959680 | consumed tokens: 12205424640 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.597989E+00 | loss scale: 2048.0 | grad norm: 5.789 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.998 | TFLOPs: 42.37 | +[default7]: iteration 5821/ 6200 | consumed samples: 5960704 | consumed tokens: 12207521792 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618579E+00 | loss scale: 2048.0 | grad norm: 5.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.792 | TFLOPs: 42.30 | +[default7]: iteration 5822/ 6200 | consumed samples: 5961728 | consumed tokens: 12209618944 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.606221E+00 | loss scale: 2048.0 | grad norm: 5.744 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.075 | TFLOPs: 42.39 | +[default7]: iteration 5823/ 6200 | consumed samples: 5962752 | consumed tokens: 12211716096 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.607257E+00 | loss scale: 2048.0 | grad norm: 5.652 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.846 | TFLOPs: 42.32 | +[default7]: iteration 5824/ 6200 | consumed samples: 5963776 | consumed tokens: 12213813248 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.652802E+00 | loss scale: 2048.0 | grad norm: 5.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.532 | TFLOPs: 42.23 | +[default7]: iteration 5825/ 6200 | consumed samples: 5964800 | consumed tokens: 12215910400 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610781E+00 | loss scale: 2048.0 | grad norm: 5.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.552 | TFLOPs: 42.23 | +[default7]: iteration 5826/ 6200 | consumed samples: 5965824 | consumed tokens: 12218007552 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638849E+00 | loss scale: 2048.0 | grad norm: 6.009 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.808 | TFLOPs: 42.31 | +[default7]: iteration 5827/ 6200 | consumed samples: 5966848 | consumed tokens: 12220104704 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626673E+00 | loss scale: 2048.0 | grad norm: 5.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.715 | TFLOPs: 42.28 | +[default7]: iteration 5828/ 6200 | consumed samples: 5967872 | consumed tokens: 12222201856 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624582E+00 | loss scale: 2048.0 | grad norm: 4.845 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.919 | TFLOPs: 42.34 | +[default7]: iteration 5829/ 6200 | consumed samples: 5968896 | consumed tokens: 12224299008 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.604430E+00 | loss scale: 2048.0 | grad norm: 5.043 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.959 | TFLOPs: 42.36 | +[default7]: iteration 5830/ 6200 | consumed samples: 5969920 | consumed tokens: 12226396160 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641765E+00 | loss scale: 2048.0 | grad norm: 5.874 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.092 | TFLOPs: 42.40 | +[default7]: iteration 5831/ 6200 | consumed samples: 5970944 | consumed tokens: 12228493312 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628920E+00 | loss scale: 2048.0 | grad norm: 5.686 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.995 | TFLOPs: 42.37 | +[default7]: iteration 5832/ 6200 | consumed samples: 5971968 | consumed tokens: 12230590464 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.600528E+00 | loss scale: 2048.0 | grad norm: 6.523 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.112 | TFLOPs: 42.40 | +[default7]: iteration 5833/ 6200 | consumed samples: 5972992 | consumed tokens: 12232687616 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654119E+00 | loss scale: 2048.0 | grad norm: 5.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.047 | TFLOPs: 42.38 | +[default7]: iteration 5834/ 6200 | consumed samples: 5974016 | consumed tokens: 12234784768 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629593E+00 | loss scale: 2048.0 | grad norm: 5.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.287 | TFLOPs: 42.46 | +[default7]: iteration 5835/ 6200 | consumed samples: 5975040 | consumed tokens: 12236881920 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617939E+00 | loss scale: 2048.0 | grad norm: 6.752 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.196 | TFLOPs: 42.43 | +[default7]: iteration 5836/ 6200 | consumed samples: 5976064 | consumed tokens: 12238979072 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639686E+00 | loss scale: 2048.0 | grad norm: 6.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.212 | TFLOPs: 42.43 | +[default7]: iteration 5837/ 6200 | consumed samples: 5977088 | consumed tokens: 12241076224 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.649815E+00 | loss scale: 2048.0 | grad norm: 5.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.076 | TFLOPs: 42.39 | +[default7]: iteration 5838/ 6200 | consumed samples: 5978112 | consumed tokens: 12243173376 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612456E+00 | loss scale: 2048.0 | grad norm: 5.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.199 | TFLOPs: 42.43 | +[default7]: iteration 5839/ 6200 | consumed samples: 5979136 | consumed tokens: 12245270528 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.596070E+00 | loss scale: 2048.0 | grad norm: 6.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.085 | TFLOPs: 42.39 | +[default7]: iteration 5840/ 6200 | consumed samples: 5980160 | consumed tokens: 12247367680 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629385E+00 | loss scale: 2048.0 | grad norm: 6.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.977 | TFLOPs: 42.36 | +[default7]: iteration 5841/ 6200 | consumed samples: 5981184 | consumed tokens: 12249464832 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638513E+00 | loss scale: 2048.0 | grad norm: 5.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.139 | TFLOPs: 42.41 | +[default7]: iteration 5842/ 6200 | consumed samples: 5982208 | consumed tokens: 12251561984 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639576E+00 | loss scale: 2048.0 | grad norm: 6.010 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.283 | TFLOPs: 42.45 | +[default7]: iteration 5843/ 6200 | consumed samples: 5983232 | consumed tokens: 12253659136 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637640E+00 | loss scale: 2048.0 | grad norm: 6.865 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.101 | TFLOPs: 42.40 | +[default7]: iteration 5844/ 6200 | consumed samples: 5984256 | consumed tokens: 12255756288 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627283E+00 | loss scale: 2048.0 | grad norm: 5.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.150 | TFLOPs: 42.41 | +[default7]: iteration 5845/ 6200 | consumed samples: 5985280 | consumed tokens: 12257853440 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624982E+00 | loss scale: 2048.0 | grad norm: 5.681 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.140 | TFLOPs: 42.41 | +[default7]: iteration 5846/ 6200 | consumed samples: 5986304 | consumed tokens: 12259950592 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634115E+00 | loss scale: 2048.0 | grad norm: 5.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.296 | TFLOPs: 42.46 | +[default7]: iteration 5847/ 6200 | consumed samples: 5987328 | consumed tokens: 12262047744 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.607981E+00 | loss scale: 2048.0 | grad norm: 5.952 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.079 | TFLOPs: 42.39 | +[default7]: iteration 5848/ 6200 | consumed samples: 5988352 | consumed tokens: 12264144896 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646838E+00 | loss scale: 2048.0 | grad norm: 5.956 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.122 | TFLOPs: 42.41 | +[default7]: iteration 5849/ 6200 | consumed samples: 5989376 | consumed tokens: 12266242048 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.635991E+00 | loss scale: 2048.0 | grad norm: 6.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.988 | TFLOPs: 42.36 | +[default7]: iteration 5850/ 6200 | consumed samples: 5990400 | consumed tokens: 12268339200 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648823E+00 | loss scale: 2048.0 | grad norm: 5.627 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.155 | TFLOPs: 42.42 | +[default7]: iteration 5851/ 6200 | consumed samples: 5991424 | consumed tokens: 12270436352 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642030E+00 | loss scale: 2048.0 | grad norm: 6.624 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.303 | TFLOPs: 42.46 | +[default7]: iteration 5852/ 6200 | consumed samples: 5992448 | consumed tokens: 12272533504 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.600672E+00 | loss scale: 2048.0 | grad norm: 6.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.074 | TFLOPs: 42.39 | +[default7]: iteration 5853/ 6200 | consumed samples: 5993472 | consumed tokens: 12274630656 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628190E+00 | loss scale: 2048.0 | grad norm: 5.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.316 | TFLOPs: 42.46 | +[default7]: iteration 5854/ 6200 | consumed samples: 5994496 | consumed tokens: 12276727808 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.607658E+00 | loss scale: 2048.0 | grad norm: 5.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.287 | TFLOPs: 42.46 | +[default7]: iteration 5855/ 6200 | consumed samples: 5995520 | consumed tokens: 12278824960 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611346E+00 | loss scale: 2048.0 | grad norm: 5.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.236 | TFLOPs: 42.44 | +[default7]: iteration 5856/ 6200 | consumed samples: 5996544 | consumed tokens: 12280922112 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.657095E+00 | loss scale: 2048.0 | grad norm: 5.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.373 | TFLOPs: 42.48 | +[default7]: iteration 5857/ 6200 | consumed samples: 5997568 | consumed tokens: 12283019264 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633022E+00 | loss scale: 2048.0 | grad norm: 5.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.369 | TFLOPs: 42.48 | +[default7]: iteration 5858/ 6200 | consumed samples: 5998592 | consumed tokens: 12285116416 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617418E+00 | loss scale: 2048.0 | grad norm: 4.915 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.258 | TFLOPs: 42.45 | +[default7]: iteration 5859/ 6200 | consumed samples: 5999616 | consumed tokens: 12287213568 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615495E+00 | loss scale: 2048.0 | grad norm: 5.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.125 | TFLOPs: 42.41 | +[default7]: iteration 5860/ 6200 | consumed samples: 6000640 | consumed tokens: 12289310720 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638435E+00 | loss scale: 2048.0 | grad norm: 4.780 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.146 | TFLOPs: 42.41 | +[default7]: iteration 5861/ 6200 | consumed samples: 6001664 | consumed tokens: 12291407872 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644004E+00 | loss scale: 2048.0 | grad norm: 5.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.073 | TFLOPs: 42.39 | +[default7]: iteration 5862/ 6200 | consumed samples: 6002688 | consumed tokens: 12293505024 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637963E+00 | loss scale: 2048.0 | grad norm: 4.772 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.146 | TFLOPs: 42.41 | +[default7]: iteration 5863/ 6200 | consumed samples: 6003712 | consumed tokens: 12295602176 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.569471E+00 | loss scale: 2048.0 | grad norm: 4.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.259 | TFLOPs: 42.45 | +[default7]: iteration 5864/ 6200 | consumed samples: 6004736 | consumed tokens: 12297699328 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624282E+00 | loss scale: 2048.0 | grad norm: 4.752 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.039 | TFLOPs: 42.38 | +[default7]: iteration 5865/ 6200 | consumed samples: 6005760 | consumed tokens: 12299796480 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651162E+00 | loss scale: 2048.0 | grad norm: 5.750 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.014 | TFLOPs: 42.37 | +[default7]: iteration 5866/ 6200 | consumed samples: 6006784 | consumed tokens: 12301893632 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614199E+00 | loss scale: 2048.0 | grad norm: 4.849 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.018 | TFLOPs: 42.37 | +[default7]: iteration 5867/ 6200 | consumed samples: 6007808 | consumed tokens: 12303990784 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.603634E+00 | loss scale: 2048.0 | grad norm: 5.559 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.949 | TFLOPs: 42.35 | +[default7]: iteration 5868/ 6200 | consumed samples: 6008832 | consumed tokens: 12306087936 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641076E+00 | loss scale: 2048.0 | grad norm: 5.640 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.076 | TFLOPs: 42.39 | +[default7]: iteration 5869/ 6200 | consumed samples: 6009856 | consumed tokens: 12308185088 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.598730E+00 | loss scale: 2048.0 | grad norm: 6.053 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.016 | TFLOPs: 42.37 | +[default7]: iteration 5870/ 6200 | consumed samples: 6010880 | consumed tokens: 12310282240 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638090E+00 | loss scale: 2048.0 | grad norm: 5.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.750 | TFLOPs: 42.29 | +[default7]: iteration 5871/ 6200 | consumed samples: 6011904 | consumed tokens: 12312379392 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643220E+00 | loss scale: 2048.0 | grad norm: 4.712 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.708 | TFLOPs: 42.28 | +[default7]: iteration 5872/ 6200 | consumed samples: 6012928 | consumed tokens: 12314476544 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.663949E+00 | loss scale: 2048.0 | grad norm: 6.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.757 | TFLOPs: 42.29 | +[default7]: iteration 5873/ 6200 | consumed samples: 6013952 | consumed tokens: 12316573696 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619652E+00 | loss scale: 2048.0 | grad norm: 5.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.997 | TFLOPs: 42.37 | +[default7]: iteration 5874/ 6200 | consumed samples: 6014976 | consumed tokens: 12318670848 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.608355E+00 | loss scale: 2048.0 | grad norm: 5.874 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.326 | TFLOPs: 42.47 | +[default7]: iteration 5875/ 6200 | consumed samples: 6016000 | consumed tokens: 12320768000 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.616462E+00 | loss scale: 2048.0 | grad norm: 5.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.937 | TFLOPs: 42.35 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 5875 | lm loss value: 3.665186E+00 | lm loss PPL: 3.906340E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 5875 | lm loss value: 1.524679E+00 | lm loss PPL: 4.593669E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 5876/ 6200 | consumed samples: 6017024 | consumed tokens: 12322865152 | elapsed time per iteration (s): 52.26 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640579E+00 | loss scale: 2048.0 | grad norm: 5.745 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.593 | TFLOPs: 5.97 | +[default7]: iteration 5877/ 6200 | consumed samples: 6018048 | consumed tokens: 12324962304 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628155E+00 | loss scale: 2048.0 | grad norm: 5.779 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.834 | TFLOPs: 42.32 | +[default7]: iteration 5878/ 6200 | consumed samples: 6019072 | consumed tokens: 12327059456 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623537E+00 | loss scale: 2048.0 | grad norm: 4.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.776 | TFLOPs: 42.30 | +[default7]: iteration 5879/ 6200 | consumed samples: 6020096 | consumed tokens: 12329156608 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639467E+00 | loss scale: 2048.0 | grad norm: 4.792 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.846 | TFLOPs: 42.32 | +[default7]: iteration 5880/ 6200 | consumed samples: 6021120 | consumed tokens: 12331253760 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.606177E+00 | loss scale: 2048.0 | grad norm: 5.724 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.461 | TFLOPs: 42.20 | +[default7]: iteration 5881/ 6200 | consumed samples: 6022144 | consumed tokens: 12333350912 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.585009E+00 | loss scale: 2048.0 | grad norm: 5.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.810 | TFLOPs: 42.31 | +[default7]: iteration 5882/ 6200 | consumed samples: 6023168 | consumed tokens: 12335448064 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.599064E+00 | loss scale: 2048.0 | grad norm: 5.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.108 | TFLOPs: 42.40 | +[default7]: iteration 5883/ 6200 | consumed samples: 6024192 | consumed tokens: 12337545216 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.602845E+00 | loss scale: 2048.0 | grad norm: 4.741 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.066 | TFLOPs: 42.39 | +[default7]: iteration 5884/ 6200 | consumed samples: 6025216 | consumed tokens: 12339642368 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618793E+00 | loss scale: 2048.0 | grad norm: 5.774 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.120 | TFLOPs: 42.41 | +[default7]: iteration 5885/ 6200 | consumed samples: 6026240 | consumed tokens: 12341739520 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.592492E+00 | loss scale: 2048.0 | grad norm: 5.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.941 | TFLOPs: 42.35 | +[default7]: iteration 5886/ 6200 | consumed samples: 6027264 | consumed tokens: 12343836672 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643834E+00 | loss scale: 2048.0 | grad norm: 7.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.042 | TFLOPs: 42.38 | +[default7]: iteration 5887/ 6200 | consumed samples: 6028288 | consumed tokens: 12345933824 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623789E+00 | loss scale: 2048.0 | grad norm: 7.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.070 | TFLOPs: 42.39 | +[default7]: iteration 5888/ 6200 | consumed samples: 6029312 | consumed tokens: 12348030976 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.607121E+00 | loss scale: 2048.0 | grad norm: 5.887 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.164 | TFLOPs: 42.42 | +[default7]: iteration 5889/ 6200 | consumed samples: 6030336 | consumed tokens: 12350128128 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629528E+00 | loss scale: 2048.0 | grad norm: 5.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.084 | TFLOPs: 42.39 | +[default7]: iteration 5890/ 6200 | consumed samples: 6031360 | consumed tokens: 12352225280 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640617E+00 | loss scale: 2048.0 | grad norm: 6.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.074 | TFLOPs: 42.39 | +[default7]: iteration 5891/ 6200 | consumed samples: 6032384 | consumed tokens: 12354322432 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623506E+00 | loss scale: 2048.0 | grad norm: 6.854 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.899 | TFLOPs: 42.34 | +[default7]: iteration 5892/ 6200 | consumed samples: 6033408 | consumed tokens: 12356419584 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626795E+00 | loss scale: 2048.0 | grad norm: 5.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.021 | TFLOPs: 42.37 | +[default7]: iteration 5893/ 6200 | consumed samples: 6034432 | consumed tokens: 12358516736 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640849E+00 | loss scale: 2048.0 | grad norm: 4.618 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.286 | TFLOPs: 42.46 | +[default7]: iteration 5894/ 6200 | consumed samples: 6035456 | consumed tokens: 12360613888 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637028E+00 | loss scale: 2048.0 | grad norm: 5.010 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.149 | TFLOPs: 42.41 | +[default7]: iteration 5895/ 6200 | consumed samples: 6036480 | consumed tokens: 12362711040 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.596593E+00 | loss scale: 2048.0 | grad norm: 5.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.148 | TFLOPs: 42.41 | +[default7]: iteration 5896/ 6200 | consumed samples: 6037504 | consumed tokens: 12364808192 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609140E+00 | loss scale: 2048.0 | grad norm: 6.984 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.006 | TFLOPs: 42.37 | +[default7]: iteration 5897/ 6200 | consumed samples: 6038528 | consumed tokens: 12366905344 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628915E+00 | loss scale: 2048.0 | grad norm: 5.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.944 | TFLOPs: 42.35 | +[default0]:[2022-10-07 01:24:49,841] [INFO] [stage_1_and_2.py:1720:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2048.0, reducing to 2048.0 +[default7]: iteration 5898/ 6200 | consumed samples: 6039552 | consumed tokens: 12369002496 | elapsed time per iteration (s): 7.25 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.616922E+00 | loss scale: 2048.0 | grad norm: 5.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 141.337 | TFLOPs: 43.08 | +[default7]: iteration 5899/ 6200 | consumed samples: 6040576 | consumed tokens: 12371099648 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.613580E+00 | loss scale: 2048.0 | grad norm: 6.745 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.256 | TFLOPs: 42.45 | +[default7]: iteration 5900/ 6200 | consumed samples: 6041600 | consumed tokens: 12373196800 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625520E+00 | loss scale: 2048.0 | grad norm: 7.790 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.885 | TFLOPs: 42.33 | +[default7]: iteration 5901/ 6200 | consumed samples: 6042624 | consumed tokens: 12375293952 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.645244E+00 | loss scale: 2048.0 | grad norm: 5.725 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.065 | TFLOPs: 42.39 | +[default7]: iteration 5902/ 6200 | consumed samples: 6043648 | consumed tokens: 12377391104 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.616621E+00 | loss scale: 2048.0 | grad norm: 5.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.064 | TFLOPs: 42.39 | +[default7]: iteration 5903/ 6200 | consumed samples: 6044672 | consumed tokens: 12379488256 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609472E+00 | loss scale: 2048.0 | grad norm: 4.574 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.785 | TFLOPs: 42.30 | +[default7]: iteration 5904/ 6200 | consumed samples: 6045696 | consumed tokens: 12381585408 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.581729E+00 | loss scale: 2048.0 | grad norm: 6.828 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.900 | TFLOPs: 42.34 | +[default7]: iteration 5905/ 6200 | consumed samples: 6046720 | consumed tokens: 12383682560 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611867E+00 | loss scale: 2048.0 | grad norm: 5.523 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.948 | TFLOPs: 42.35 | +[default7]: iteration 5906/ 6200 | consumed samples: 6047744 | consumed tokens: 12385779712 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.604676E+00 | loss scale: 2048.0 | grad norm: 6.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.039 | TFLOPs: 42.38 | +[default7]: iteration 5907/ 6200 | consumed samples: 6048768 | consumed tokens: 12387876864 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629242E+00 | loss scale: 2048.0 | grad norm: 4.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.078 | TFLOPs: 42.39 | +[default7]: iteration 5908/ 6200 | consumed samples: 6049792 | consumed tokens: 12389974016 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618711E+00 | loss scale: 2048.0 | grad norm: 5.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.804 | TFLOPs: 42.31 | +[default7]: iteration 5909/ 6200 | consumed samples: 6050816 | consumed tokens: 12392071168 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624263E+00 | loss scale: 2048.0 | grad norm: 5.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.653 | TFLOPs: 42.26 | +[default7]: iteration 5910/ 6200 | consumed samples: 6051840 | consumed tokens: 12394168320 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.608039E+00 | loss scale: 2048.0 | grad norm: 5.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.672 | TFLOPs: 42.27 | +[default7]: iteration 5911/ 6200 | consumed samples: 6052864 | consumed tokens: 12396265472 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.594253E+00 | loss scale: 2048.0 | grad norm: 5.763 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.478 | TFLOPs: 42.21 | +[default7]: iteration 5912/ 6200 | consumed samples: 6053888 | consumed tokens: 12398362624 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.620481E+00 | loss scale: 2048.0 | grad norm: 6.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.752 | TFLOPs: 42.29 | +[default7]: iteration 5913/ 6200 | consumed samples: 6054912 | consumed tokens: 12400459776 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.594741E+00 | loss scale: 2048.0 | grad norm: 5.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.482 | TFLOPs: 42.21 | +[default7]: iteration 5914/ 6200 | consumed samples: 6055936 | consumed tokens: 12402556928 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622436E+00 | loss scale: 2048.0 | grad norm: 4.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.631 | TFLOPs: 42.26 | +[default7]: iteration 5915/ 6200 | consumed samples: 6056960 | consumed tokens: 12404654080 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615149E+00 | loss scale: 2048.0 | grad norm: 5.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.607 | TFLOPs: 42.25 | +[default7]: iteration 5916/ 6200 | consumed samples: 6057984 | consumed tokens: 12406751232 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621676E+00 | loss scale: 2048.0 | grad norm: 5.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.654 | TFLOPs: 42.26 | +[default7]: iteration 5917/ 6200 | consumed samples: 6059008 | consumed tokens: 12408848384 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615225E+00 | loss scale: 2048.0 | grad norm: 4.946 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.649 | TFLOPs: 42.26 | +[default7]: iteration 5918/ 6200 | consumed samples: 6060032 | consumed tokens: 12410945536 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625259E+00 | loss scale: 2048.0 | grad norm: 5.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.614 | TFLOPs: 42.25 | +[default7]: iteration 5919/ 6200 | consumed samples: 6061056 | consumed tokens: 12413042688 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.648968E+00 | loss scale: 2048.0 | grad norm: 5.030 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.737 | TFLOPs: 42.29 | +[default7]: iteration 5920/ 6200 | consumed samples: 6062080 | consumed tokens: 12415139840 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632495E+00 | loss scale: 2048.0 | grad norm: 5.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.828 | TFLOPs: 42.32 | +[default7]: iteration 5921/ 6200 | consumed samples: 6063104 | consumed tokens: 12417236992 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621554E+00 | loss scale: 2048.0 | grad norm: 5.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.805 | TFLOPs: 42.31 | +[default7]: iteration 5922/ 6200 | consumed samples: 6064128 | consumed tokens: 12419334144 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621118E+00 | loss scale: 2048.0 | grad norm: 5.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.028 | TFLOPs: 42.38 | +[default7]: iteration 5923/ 6200 | consumed samples: 6065152 | consumed tokens: 12421431296 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634554E+00 | loss scale: 2048.0 | grad norm: 5.583 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.847 | TFLOPs: 42.32 | +[default7]: iteration 5924/ 6200 | consumed samples: 6066176 | consumed tokens: 12423528448 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.613179E+00 | loss scale: 2048.0 | grad norm: 5.074 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.725 | TFLOPs: 42.28 | +[default7]: iteration 5925/ 6200 | consumed samples: 6067200 | consumed tokens: 12425625600 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631511E+00 | loss scale: 2048.0 | grad norm: 4.977 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.163 | TFLOPs: 42.42 | +[default7]: iteration 5926/ 6200 | consumed samples: 6068224 | consumed tokens: 12427722752 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624390E+00 | loss scale: 2048.0 | grad norm: 4.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.435 | TFLOPs: 42.20 | +[default7]: iteration 5927/ 6200 | consumed samples: 6069248 | consumed tokens: 12429819904 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610704E+00 | loss scale: 2048.0 | grad norm: 5.555 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.804 | TFLOPs: 42.31 | +[default7]: iteration 5928/ 6200 | consumed samples: 6070272 | consumed tokens: 12431917056 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640490E+00 | loss scale: 2048.0 | grad norm: 7.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.375 | TFLOPs: 42.18 | +[default7]: iteration 5929/ 6200 | consumed samples: 6071296 | consumed tokens: 12434014208 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.597793E+00 | loss scale: 2048.0 | grad norm: 5.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.087 | TFLOPs: 42.39 | +[default7]: iteration 5930/ 6200 | consumed samples: 6072320 | consumed tokens: 12436111360 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612926E+00 | loss scale: 2048.0 | grad norm: 5.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.089 | TFLOPs: 42.40 | +[default7]: iteration 5931/ 6200 | consumed samples: 6073344 | consumed tokens: 12438208512 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.616702E+00 | loss scale: 2048.0 | grad norm: 5.898 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.019 | TFLOPs: 42.37 | +[default7]: iteration 5932/ 6200 | consumed samples: 6074368 | consumed tokens: 12440305664 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.605184E+00 | loss scale: 2048.0 | grad norm: 7.040 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.975 | TFLOPs: 42.36 | +[default7]: iteration 5933/ 6200 | consumed samples: 6075392 | consumed tokens: 12442402816 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631151E+00 | loss scale: 2048.0 | grad norm: 6.043 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.658 | TFLOPs: 42.26 | +[default7]: iteration 5934/ 6200 | consumed samples: 6076416 | consumed tokens: 12444499968 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632125E+00 | loss scale: 2048.0 | grad norm: 5.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.648 | TFLOPs: 42.26 | +[default7]: iteration 5935/ 6200 | consumed samples: 6077440 | consumed tokens: 12446597120 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651613E+00 | loss scale: 2048.0 | grad norm: 6.029 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.761 | TFLOPs: 42.30 | +[default7]: iteration 5936/ 6200 | consumed samples: 6078464 | consumed tokens: 12448694272 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.601464E+00 | loss scale: 2048.0 | grad norm: 6.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.802 | TFLOPs: 42.31 | +[default7]: iteration 5937/ 6200 | consumed samples: 6079488 | consumed tokens: 12450791424 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650498E+00 | loss scale: 2048.0 | grad norm: 5.061 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.608 | TFLOPs: 42.25 | +[default7]: iteration 5938/ 6200 | consumed samples: 6080512 | consumed tokens: 12452888576 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.595878E+00 | loss scale: 2048.0 | grad norm: 6.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.647 | TFLOPs: 42.26 | +[default7]: iteration 5939/ 6200 | consumed samples: 6081536 | consumed tokens: 12454985728 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.604908E+00 | loss scale: 2048.0 | grad norm: 5.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.708 | TFLOPs: 42.28 | +[default7]: iteration 5940/ 6200 | consumed samples: 6082560 | consumed tokens: 12457082880 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609425E+00 | loss scale: 2048.0 | grad norm: 4.951 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.571 | TFLOPs: 42.24 | +[default7]: iteration 5941/ 6200 | consumed samples: 6083584 | consumed tokens: 12459180032 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623330E+00 | loss scale: 2048.0 | grad norm: 4.885 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.757 | TFLOPs: 42.29 | +[default7]: iteration 5942/ 6200 | consumed samples: 6084608 | consumed tokens: 12461277184 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621971E+00 | loss scale: 2048.0 | grad norm: 6.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.451 | TFLOPs: 42.20 | +[default7]: iteration 5943/ 6200 | consumed samples: 6085632 | consumed tokens: 12463374336 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646114E+00 | loss scale: 2048.0 | grad norm: 4.547 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.379 | TFLOPs: 42.18 | +[default7]: iteration 5944/ 6200 | consumed samples: 6086656 | consumed tokens: 12465471488 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.620016E+00 | loss scale: 2048.0 | grad norm: 5.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.827 | TFLOPs: 42.32 | +[default7]: iteration 5945/ 6200 | consumed samples: 6087680 | consumed tokens: 12467568640 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628088E+00 | loss scale: 2048.0 | grad norm: 5.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.849 | TFLOPs: 42.32 | +[default7]: iteration 5946/ 6200 | consumed samples: 6088704 | consumed tokens: 12469665792 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615790E+00 | loss scale: 2048.0 | grad norm: 5.877 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.004 | TFLOPs: 42.37 | +[default7]: iteration 5947/ 6200 | consumed samples: 6089728 | consumed tokens: 12471762944 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631334E+00 | loss scale: 2048.0 | grad norm: 4.902 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.911 | TFLOPs: 42.34 | +[default7]: iteration 5948/ 6200 | consumed samples: 6090752 | consumed tokens: 12473860096 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615596E+00 | loss scale: 2048.0 | grad norm: 4.837 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.928 | TFLOPs: 42.35 | +[default7]: iteration 5949/ 6200 | consumed samples: 6091776 | consumed tokens: 12475957248 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621256E+00 | loss scale: 2048.0 | grad norm: 5.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.812 | TFLOPs: 42.31 | +[default7]: iteration 5950/ 6200 | consumed samples: 6092800 | consumed tokens: 12478054400 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615267E+00 | loss scale: 2048.0 | grad norm: 5.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.783 | TFLOPs: 42.30 | +[default7]: iteration 5951/ 6200 | consumed samples: 6093824 | consumed tokens: 12480151552 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.596392E+00 | loss scale: 2048.0 | grad norm: 4.690 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.230 | TFLOPs: 42.44 | +[default7]: iteration 5952/ 6200 | consumed samples: 6094848 | consumed tokens: 12482248704 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628030E+00 | loss scale: 2048.0 | grad norm: 4.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.336 | TFLOPs: 42.47 | +[default7]: iteration 5953/ 6200 | consumed samples: 6095872 | consumed tokens: 12484345856 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.597124E+00 | loss scale: 2048.0 | grad norm: 4.996 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.217 | TFLOPs: 42.43 | +[default7]: iteration 5954/ 6200 | consumed samples: 6096896 | consumed tokens: 12486443008 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618466E+00 | loss scale: 2048.0 | grad norm: 5.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.148 | TFLOPs: 42.41 | +[default7]: iteration 5955/ 6200 | consumed samples: 6097920 | consumed tokens: 12488540160 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629689E+00 | loss scale: 2048.0 | grad norm: 6.068 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.992 | TFLOPs: 42.37 | +[default7]: iteration 5956/ 6200 | consumed samples: 6098944 | consumed tokens: 12490637312 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646263E+00 | loss scale: 2048.0 | grad norm: 4.681 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.978 | TFLOPs: 42.36 | +[default7]: iteration 5957/ 6200 | consumed samples: 6099968 | consumed tokens: 12492734464 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638296E+00 | loss scale: 2048.0 | grad norm: 5.606 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.734 | TFLOPs: 42.29 | +[default7]: iteration 5958/ 6200 | consumed samples: 6100992 | consumed tokens: 12494831616 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614128E+00 | loss scale: 2048.0 | grad norm: 4.856 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.087 | TFLOPs: 42.39 | +[default7]: iteration 5959/ 6200 | consumed samples: 6102016 | consumed tokens: 12496928768 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.606193E+00 | loss scale: 2048.0 | grad norm: 4.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.876 | TFLOPs: 42.33 | +[default7]: iteration 5960/ 6200 | consumed samples: 6103040 | consumed tokens: 12499025920 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633677E+00 | loss scale: 2048.0 | grad norm: 4.762 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.028 | TFLOPs: 42.38 | +[default7]: iteration 5961/ 6200 | consumed samples: 6104064 | consumed tokens: 12501123072 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623859E+00 | loss scale: 2048.0 | grad norm: 4.892 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.878 | TFLOPs: 42.33 | +[default7]: iteration 5962/ 6200 | consumed samples: 6105088 | consumed tokens: 12503220224 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.613765E+00 | loss scale: 2048.0 | grad norm: 4.809 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.971 | TFLOPs: 42.36 | +[default7]: iteration 5963/ 6200 | consumed samples: 6106112 | consumed tokens: 12505317376 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644491E+00 | loss scale: 2048.0 | grad norm: 5.074 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.786 | TFLOPs: 42.30 | +[default7]: iteration 5964/ 6200 | consumed samples: 6107136 | consumed tokens: 12507414528 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624784E+00 | loss scale: 2048.0 | grad norm: 4.786 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.919 | TFLOPs: 42.34 | +[default7]: iteration 5965/ 6200 | consumed samples: 6108160 | consumed tokens: 12509511680 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623443E+00 | loss scale: 2048.0 | grad norm: 4.924 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.021 | TFLOPs: 42.37 | +[default7]: iteration 5966/ 6200 | consumed samples: 6109184 | consumed tokens: 12511608832 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622696E+00 | loss scale: 2048.0 | grad norm: 7.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.889 | TFLOPs: 42.33 | +[default7]: iteration 5967/ 6200 | consumed samples: 6110208 | consumed tokens: 12513705984 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.620533E+00 | loss scale: 2048.0 | grad norm: 5.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.097 | TFLOPs: 42.40 | +[default7]: iteration 5968/ 6200 | consumed samples: 6111232 | consumed tokens: 12515803136 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617423E+00 | loss scale: 2048.0 | grad norm: 4.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.072 | TFLOPs: 42.39 | +[default7]: iteration 5969/ 6200 | consumed samples: 6112256 | consumed tokens: 12517900288 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626783E+00 | loss scale: 2048.0 | grad norm: 5.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.849 | TFLOPs: 42.32 | +[default7]: iteration 5970/ 6200 | consumed samples: 6113280 | consumed tokens: 12519997440 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615537E+00 | loss scale: 2048.0 | grad norm: 5.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.786 | TFLOPs: 42.30 | +[default7]: iteration 5971/ 6200 | consumed samples: 6114304 | consumed tokens: 12522094592 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.602648E+00 | loss scale: 2048.0 | grad norm: 4.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.014 | TFLOPs: 42.37 | +[default7]: iteration 5972/ 6200 | consumed samples: 6115328 | consumed tokens: 12524191744 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637566E+00 | loss scale: 2048.0 | grad norm: 5.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.957 | TFLOPs: 42.36 | +[default7]: iteration 5973/ 6200 | consumed samples: 6116352 | consumed tokens: 12526288896 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610809E+00 | loss scale: 2048.0 | grad norm: 4.882 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.942 | TFLOPs: 42.35 | +[default7]: iteration 5974/ 6200 | consumed samples: 6117376 | consumed tokens: 12528386048 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.646945E+00 | loss scale: 2048.0 | grad norm: 6.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.986 | TFLOPs: 42.36 | +[default7]: iteration 5975/ 6200 | consumed samples: 6118400 | consumed tokens: 12530483200 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619092E+00 | loss scale: 2048.0 | grad norm: 4.851 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.663 | TFLOPs: 42.27 | +[default7]: iteration 5976/ 6200 | consumed samples: 6119424 | consumed tokens: 12532580352 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.596813E+00 | loss scale: 2048.0 | grad norm: 5.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.942 | TFLOPs: 42.35 | +[default7]: iteration 5977/ 6200 | consumed samples: 6120448 | consumed tokens: 12534677504 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626476E+00 | loss scale: 2048.0 | grad norm: 4.982 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.131 | TFLOPs: 42.41 | +[default7]: iteration 5978/ 6200 | consumed samples: 6121472 | consumed tokens: 12536774656 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627978E+00 | loss scale: 2048.0 | grad norm: 4.854 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.747 | TFLOPs: 42.29 | +[default7]: iteration 5979/ 6200 | consumed samples: 6122496 | consumed tokens: 12538871808 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609752E+00 | loss scale: 2048.0 | grad norm: 5.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.100 | TFLOPs: 42.40 | +[default7]: iteration 5980/ 6200 | consumed samples: 6123520 | consumed tokens: 12540968960 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627098E+00 | loss scale: 2048.0 | grad norm: 6.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.961 | TFLOPs: 42.36 | +[default7]: iteration 5981/ 6200 | consumed samples: 6124544 | consumed tokens: 12543066112 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618033E+00 | loss scale: 2048.0 | grad norm: 4.958 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.862 | TFLOPs: 42.33 | +[default7]: iteration 5982/ 6200 | consumed samples: 6125568 | consumed tokens: 12545163264 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642540E+00 | loss scale: 2048.0 | grad norm: 5.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.928 | TFLOPs: 42.35 | +[default7]: iteration 5983/ 6200 | consumed samples: 6126592 | consumed tokens: 12547260416 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625028E+00 | loss scale: 2048.0 | grad norm: 6.035 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.009 | TFLOPs: 42.37 | +[default7]: iteration 5984/ 6200 | consumed samples: 6127616 | consumed tokens: 12549357568 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.608402E+00 | loss scale: 2048.0 | grad norm: 5.068 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.004 | TFLOPs: 42.37 | +[default7]: iteration 5985/ 6200 | consumed samples: 6128640 | consumed tokens: 12551454720 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.605927E+00 | loss scale: 2048.0 | grad norm: 5.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.028 | TFLOPs: 42.38 | +[default7]: iteration 5986/ 6200 | consumed samples: 6129664 | consumed tokens: 12553551872 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612826E+00 | loss scale: 2048.0 | grad norm: 5.923 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.145 | TFLOPs: 42.41 | +[default7]: iteration 5987/ 6200 | consumed samples: 6130688 | consumed tokens: 12555649024 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627307E+00 | loss scale: 2048.0 | grad norm: 5.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.076 | TFLOPs: 42.39 | +[default7]: iteration 5988/ 6200 | consumed samples: 6131712 | consumed tokens: 12557746176 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626592E+00 | loss scale: 2048.0 | grad norm: 4.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.801 | TFLOPs: 42.31 | +[default7]: iteration 5989/ 6200 | consumed samples: 6132736 | consumed tokens: 12559843328 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631177E+00 | loss scale: 2048.0 | grad norm: 5.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.843 | TFLOPs: 42.32 | +[default7]: iteration 5990/ 6200 | consumed samples: 6133760 | consumed tokens: 12561940480 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.594359E+00 | loss scale: 2048.0 | grad norm: 5.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.840 | TFLOPs: 42.32 | +[default7]: iteration 5991/ 6200 | consumed samples: 6134784 | consumed tokens: 12564037632 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.642699E+00 | loss scale: 2048.0 | grad norm: 5.809 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.915 | TFLOPs: 42.34 | +[default7]: iteration 5992/ 6200 | consumed samples: 6135808 | consumed tokens: 12566134784 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.602332E+00 | loss scale: 2048.0 | grad norm: 5.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.796 | TFLOPs: 42.31 | +[default7]: iteration 5993/ 6200 | consumed samples: 6136832 | consumed tokens: 12568231936 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.602868E+00 | loss scale: 2048.0 | grad norm: 5.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.689 | TFLOPs: 42.27 | +[default7]: iteration 5994/ 6200 | consumed samples: 6137856 | consumed tokens: 12570329088 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609036E+00 | loss scale: 2048.0 | grad norm: 4.826 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.869 | TFLOPs: 42.33 | +[default7]: iteration 5995/ 6200 | consumed samples: 6138880 | consumed tokens: 12572426240 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.637317E+00 | loss scale: 2048.0 | grad norm: 5.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.834 | TFLOPs: 42.32 | +[default7]: iteration 5996/ 6200 | consumed samples: 6139904 | consumed tokens: 12574523392 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.593901E+00 | loss scale: 2048.0 | grad norm: 4.876 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.702 | TFLOPs: 42.28 | +[default7]: iteration 5997/ 6200 | consumed samples: 6140928 | consumed tokens: 12576620544 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.608331E+00 | loss scale: 2048.0 | grad norm: 5.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.810 | TFLOPs: 42.31 | +[default7]: iteration 5998/ 6200 | consumed samples: 6141952 | consumed tokens: 12578717696 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.602455E+00 | loss scale: 2048.0 | grad norm: 5.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.638 | TFLOPs: 42.26 | +[default7]: iteration 5999/ 6200 | consumed samples: 6142976 | consumed tokens: 12580814848 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.613104E+00 | loss scale: 2048.0 | grad norm: 4.863 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.676 | TFLOPs: 42.27 | +[default0]:[2022-10-07 01:37:22,037] [INFO] [logging.py:68:log_dist] [Rank 0] step=6000, skipped=16, lr=[2e-05, 2e-05, 2e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[default7]: iteration 6000/ 6200 | consumed samples: 6144000 | consumed tokens: 12582912000 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615722E+00 | loss scale: 2048.0 | grad norm: 4.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.659 | TFLOPs: 42.26 | +[default0]:steps: 6000 loss: 1.6157 iter time (s): 7.368 samples/sec: 138.976 +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 6000 | lm loss value: 3.702144E+00 | lm loss PPL: 4.053411E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 6000 | lm loss value: 1.514747E+00 | lm loss PPL: 4.548270E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 6000 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-07 01:38:07,637] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step6000 is begin to save! +[default0]:[2022-10-07 01:38:07,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,080] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,108] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,163] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,190] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,218] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,245] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,272] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,299] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,354] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,409] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,437] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,491] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,546] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,573] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,627] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,654] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_25-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,681] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_26-model_00-model_states.pt... +[default5]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 01:38:08,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 01:38:08,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 01:38:08,709] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/mp_rank_00_model_states.pt +[default0]:[2022-10-07 01:38:08,709] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 01:38:08,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 01:38:08,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 01:38:08,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 01:38:08,925] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 01:38:08,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 01:38:08,935] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 01:38:08,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 01:38:08,910] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 01:38:08,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 01:38:08,927] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 01:38:08,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 01:38:08,991] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 01:38:09,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 01:38:09,005] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 01:38:08,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 01:38:08,943] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 01:38:09,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 01:38:09,028] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 01:38:09,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 01:38:09,026] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 01:38:09,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 01:38:09,029] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 01:38:09,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 01:38:09,030] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 01:38:09,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 01:38:09,027] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 01:38:09,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 01:38:09,048] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 01:38:09,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 01:38:09,047] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 01:38:09,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 01:38:09,029] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 01:38:09,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 01:38:09,040] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 01:38:09,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 01:38:09,027] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 01:38:09,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 01:38:09,047] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 01:38:09,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 01:38:09,079] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 01:38:09,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 01:38:09,034] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 01:38:09,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 01:38:09,029] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 01:38:09,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 01:38:09,074] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 01:38:09,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 01:38:09,033] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 01:38:09,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 01:38:09,029] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 01:38:09,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 01:38:09,054] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default7]:[2022-10-07 01:38:09,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 01:38:09,108] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default7]:time (ms) | save-checkpoint: 1517.66 +[default4]:[2022-10-07 01:38:09,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 01:38:09,071] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default3]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default3]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default6]:[2022-10-07 01:38:09,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default2]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default0]:[2022-10-07 01:38:09,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 01:38:09,148] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 01:38:09,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default1]:[2022-10-07 01:38:09,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 01:38:09,118] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 01:38:09,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default3]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default6]:[2022-10-07 01:38:09,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default2]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default2]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default6]:[2022-10-07 01:38:09,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 01:38:09,101] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default7]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default3]:[2022-10-07 01:38:09,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default0]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default0]: successfully saved checkpoint at iteration 6000 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default2]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default0]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default4]:[2022-10-07 01:38:09,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default4]:[2022-10-07 01:38:09,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 01:38:09,138] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 01:38:09,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default5]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default0]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default7]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default5]:[2022-10-07 01:38:09,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 01:38:09,153] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6000/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 01:38:09,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default4]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default1]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default6]:[2022-10-07 01:38:09,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default1]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default5]:[2022-10-07 01:38:09,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default1]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default5]:[2022-10-07 01:38:09,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +[default7]: iteration 6001/ 6200 | consumed samples: 6145024 | consumed tokens: 12585009152 | elapsed time per iteration (s): 54.49 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631508E+00 | loss scale: 2048.0 | grad norm: 4.754 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 18.791 | TFLOPs: 5.73 | +[default7]: iteration 6002/ 6200 | consumed samples: 6146048 | consumed tokens: 12587106304 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.596619E+00 | loss scale: 2048.0 | grad norm: 5.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.642 | TFLOPs: 42.26 | +[default7]: iteration 6003/ 6200 | consumed samples: 6147072 | consumed tokens: 12589203456 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629472E+00 | loss scale: 2048.0 | grad norm: 4.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.747 | TFLOPs: 42.29 | +[default7]: iteration 6004/ 6200 | consumed samples: 6148096 | consumed tokens: 12591300608 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.603192E+00 | loss scale: 2048.0 | grad norm: 5.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.677 | TFLOPs: 42.27 | +[default7]: iteration 6005/ 6200 | consumed samples: 6149120 | consumed tokens: 12593397760 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.616481E+00 | loss scale: 2048.0 | grad norm: 4.908 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.971 | TFLOPs: 42.36 | +[default7]: iteration 6006/ 6200 | consumed samples: 6150144 | consumed tokens: 12595494912 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609403E+00 | loss scale: 2048.0 | grad norm: 5.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.846 | TFLOPs: 42.32 | +[default7]: iteration 6007/ 6200 | consumed samples: 6151168 | consumed tokens: 12597592064 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.606696E+00 | loss scale: 2048.0 | grad norm: 5.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.894 | TFLOPs: 42.34 | +[default7]: iteration 6008/ 6200 | consumed samples: 6152192 | consumed tokens: 12599689216 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.656415E+00 | loss scale: 2048.0 | grad norm: 5.706 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.875 | TFLOPs: 42.33 | +[default7]: iteration 6009/ 6200 | consumed samples: 6153216 | consumed tokens: 12601786368 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634380E+00 | loss scale: 2048.0 | grad norm: 5.959 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.909 | TFLOPs: 42.34 | +[default7]: iteration 6010/ 6200 | consumed samples: 6154240 | consumed tokens: 12603883520 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.599966E+00 | loss scale: 2048.0 | grad norm: 4.603 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.970 | TFLOPs: 42.36 | +[default7]: iteration 6011/ 6200 | consumed samples: 6155264 | consumed tokens: 12605980672 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.643712E+00 | loss scale: 2048.0 | grad norm: 4.684 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.800 | TFLOPs: 42.31 | +[default7]: iteration 6012/ 6200 | consumed samples: 6156288 | consumed tokens: 12608077824 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621318E+00 | loss scale: 2048.0 | grad norm: 5.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.565 | TFLOPs: 42.24 | +[default7]: iteration 6013/ 6200 | consumed samples: 6157312 | consumed tokens: 12610174976 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.654378E+00 | loss scale: 2048.0 | grad norm: 5.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.397 | TFLOPs: 42.18 | +[default7]: iteration 6014/ 6200 | consumed samples: 6158336 | consumed tokens: 12612272128 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615871E+00 | loss scale: 2048.0 | grad norm: 5.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.596 | TFLOPs: 42.25 | +[default7]: iteration 6015/ 6200 | consumed samples: 6159360 | consumed tokens: 12614369280 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.604646E+00 | loss scale: 2048.0 | grad norm: 5.754 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.629 | TFLOPs: 42.26 | +[default7]: iteration 6016/ 6200 | consumed samples: 6160384 | consumed tokens: 12616466432 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622726E+00 | loss scale: 2048.0 | grad norm: 6.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.479 | TFLOPs: 42.21 | +[default7]: iteration 6017/ 6200 | consumed samples: 6161408 | consumed tokens: 12618563584 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618780E+00 | loss scale: 2048.0 | grad norm: 5.788 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.525 | TFLOPs: 42.22 | +[default7]: iteration 6018/ 6200 | consumed samples: 6162432 | consumed tokens: 12620660736 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641610E+00 | loss scale: 2048.0 | grad norm: 4.908 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.652 | TFLOPs: 42.26 | +[default7]: iteration 6019/ 6200 | consumed samples: 6163456 | consumed tokens: 12622757888 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628531E+00 | loss scale: 2048.0 | grad norm: 5.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.832 | TFLOPs: 42.32 | +[default7]: iteration 6020/ 6200 | consumed samples: 6164480 | consumed tokens: 12624855040 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.606532E+00 | loss scale: 2048.0 | grad norm: 6.057 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.497 | TFLOPs: 42.22 | +[default7]: iteration 6021/ 6200 | consumed samples: 6165504 | consumed tokens: 12626952192 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634313E+00 | loss scale: 2048.0 | grad norm: 5.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.568 | TFLOPs: 42.24 | +[default7]: iteration 6022/ 6200 | consumed samples: 6166528 | consumed tokens: 12629049344 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630454E+00 | loss scale: 2048.0 | grad norm: 4.971 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.522 | TFLOPs: 42.22 | +[default7]: iteration 6023/ 6200 | consumed samples: 6167552 | consumed tokens: 12631146496 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.620335E+00 | loss scale: 2048.0 | grad norm: 5.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.752 | TFLOPs: 42.29 | +[default7]: iteration 6024/ 6200 | consumed samples: 6168576 | consumed tokens: 12633243648 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622373E+00 | loss scale: 2048.0 | grad norm: 4.636 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.603 | TFLOPs: 42.25 | +[default7]: iteration 6025/ 6200 | consumed samples: 6169600 | consumed tokens: 12635340800 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626566E+00 | loss scale: 2048.0 | grad norm: 5.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.506 | TFLOPs: 42.22 | +[default7]: iteration 6026/ 6200 | consumed samples: 6170624 | consumed tokens: 12637437952 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624032E+00 | loss scale: 2048.0 | grad norm: 5.047 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.797 | TFLOPs: 42.31 | +[default7]: iteration 6027/ 6200 | consumed samples: 6171648 | consumed tokens: 12639535104 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611747E+00 | loss scale: 2048.0 | grad norm: 5.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.655 | TFLOPs: 42.26 | +[default7]: iteration 6028/ 6200 | consumed samples: 6172672 | consumed tokens: 12641632256 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623025E+00 | loss scale: 2048.0 | grad norm: 5.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.763 | TFLOPs: 42.30 | +[default7]: iteration 6029/ 6200 | consumed samples: 6173696 | consumed tokens: 12643729408 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631212E+00 | loss scale: 2048.0 | grad norm: 5.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.669 | TFLOPs: 42.27 | +[default7]: iteration 6030/ 6200 | consumed samples: 6174720 | consumed tokens: 12645826560 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629187E+00 | loss scale: 2048.0 | grad norm: 5.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.523 | TFLOPs: 42.22 | +[default7]: iteration 6031/ 6200 | consumed samples: 6175744 | consumed tokens: 12647923712 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621842E+00 | loss scale: 2048.0 | grad norm: 5.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.593 | TFLOPs: 42.24 | +[default7]: iteration 6032/ 6200 | consumed samples: 6176768 | consumed tokens: 12650020864 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638138E+00 | loss scale: 2048.0 | grad norm: 6.935 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.698 | TFLOPs: 42.28 | +[default7]: iteration 6033/ 6200 | consumed samples: 6177792 | consumed tokens: 12652118016 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629685E+00 | loss scale: 2048.0 | grad norm: 5.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.700 | TFLOPs: 42.28 | +[default7]: iteration 6034/ 6200 | consumed samples: 6178816 | consumed tokens: 12654215168 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622974E+00 | loss scale: 2048.0 | grad norm: 5.874 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.746 | TFLOPs: 42.29 | +[default7]: iteration 6035/ 6200 | consumed samples: 6179840 | consumed tokens: 12656312320 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609464E+00 | loss scale: 2048.0 | grad norm: 5.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.657 | TFLOPs: 42.26 | +[default7]: iteration 6036/ 6200 | consumed samples: 6180864 | consumed tokens: 12658409472 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621465E+00 | loss scale: 2048.0 | grad norm: 5.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.609 | TFLOPs: 42.25 | +[default7]: iteration 6037/ 6200 | consumed samples: 6181888 | consumed tokens: 12660506624 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.627819E+00 | loss scale: 2048.0 | grad norm: 6.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.870 | TFLOPs: 42.33 | +[default7]: iteration 6038/ 6200 | consumed samples: 6182912 | consumed tokens: 12662603776 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611841E+00 | loss scale: 2048.0 | grad norm: 5.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.755 | TFLOPs: 42.29 | +[default7]: iteration 6039/ 6200 | consumed samples: 6183936 | consumed tokens: 12664700928 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615760E+00 | loss scale: 2048.0 | grad norm: 4.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.048 | TFLOPs: 42.38 | +[default7]: iteration 6040/ 6200 | consumed samples: 6184960 | consumed tokens: 12666798080 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624757E+00 | loss scale: 2048.0 | grad norm: 5.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.817 | TFLOPs: 42.31 | +[default7]: iteration 6041/ 6200 | consumed samples: 6185984 | consumed tokens: 12668895232 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626413E+00 | loss scale: 2048.0 | grad norm: 5.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.932 | TFLOPs: 42.35 | +[default7]: iteration 6042/ 6200 | consumed samples: 6187008 | consumed tokens: 12670992384 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615866E+00 | loss scale: 2048.0 | grad norm: 4.805 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.720 | TFLOPs: 42.28 | +[default7]: iteration 6043/ 6200 | consumed samples: 6188032 | consumed tokens: 12673089536 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619170E+00 | loss scale: 2048.0 | grad norm: 5.747 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.683 | TFLOPs: 42.27 | +[default7]: iteration 6044/ 6200 | consumed samples: 6189056 | consumed tokens: 12675186688 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.608219E+00 | loss scale: 2048.0 | grad norm: 5.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.959 | TFLOPs: 42.36 | +[default7]: iteration 6045/ 6200 | consumed samples: 6190080 | consumed tokens: 12677283840 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.616922E+00 | loss scale: 2048.0 | grad norm: 5.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.886 | TFLOPs: 42.33 | +[default7]: iteration 6046/ 6200 | consumed samples: 6191104 | consumed tokens: 12679380992 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624698E+00 | loss scale: 2048.0 | grad norm: 5.027 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.698 | TFLOPs: 42.28 | +[default7]: iteration 6047/ 6200 | consumed samples: 6192128 | consumed tokens: 12681478144 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618831E+00 | loss scale: 2048.0 | grad norm: 4.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.924 | TFLOPs: 42.35 | +[default7]: iteration 6048/ 6200 | consumed samples: 6193152 | consumed tokens: 12683575296 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.585208E+00 | loss scale: 2048.0 | grad norm: 5.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.816 | TFLOPs: 42.31 | +[default7]: iteration 6049/ 6200 | consumed samples: 6194176 | consumed tokens: 12685672448 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.603945E+00 | loss scale: 2048.0 | grad norm: 4.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.951 | TFLOPs: 42.35 | +[default7]: iteration 6050/ 6200 | consumed samples: 6195200 | consumed tokens: 12687769600 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.584159E+00 | loss scale: 2048.0 | grad norm: 4.995 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.814 | TFLOPs: 42.31 | +[default7]: iteration 6051/ 6200 | consumed samples: 6196224 | consumed tokens: 12689866752 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.602476E+00 | loss scale: 2048.0 | grad norm: 5.045 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.778 | TFLOPs: 42.30 | +[default7]: iteration 6052/ 6200 | consumed samples: 6197248 | consumed tokens: 12691963904 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615556E+00 | loss scale: 2048.0 | grad norm: 5.019 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.871 | TFLOPs: 42.33 | +[default7]: iteration 6053/ 6200 | consumed samples: 6198272 | consumed tokens: 12694061056 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612629E+00 | loss scale: 2048.0 | grad norm: 5.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.903 | TFLOPs: 42.34 | +[default7]: iteration 6054/ 6200 | consumed samples: 6199296 | consumed tokens: 12696158208 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639847E+00 | loss scale: 2048.0 | grad norm: 6.690 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.647 | TFLOPs: 42.26 | +[default7]: iteration 6055/ 6200 | consumed samples: 6200320 | consumed tokens: 12698255360 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618488E+00 | loss scale: 2048.0 | grad norm: 6.555 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.819 | TFLOPs: 42.31 | +[default7]: iteration 6056/ 6200 | consumed samples: 6201344 | consumed tokens: 12700352512 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.651021E+00 | loss scale: 2048.0 | grad norm: 5.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.769 | TFLOPs: 42.30 | +[default7]: iteration 6057/ 6200 | consumed samples: 6202368 | consumed tokens: 12702449664 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609051E+00 | loss scale: 2048.0 | grad norm: 5.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.835 | TFLOPs: 42.32 | +[default7]: iteration 6058/ 6200 | consumed samples: 6203392 | consumed tokens: 12704546816 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630210E+00 | loss scale: 2048.0 | grad norm: 5.034 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.703 | TFLOPs: 42.28 | +[default7]: iteration 6059/ 6200 | consumed samples: 6204416 | consumed tokens: 12706643968 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628456E+00 | loss scale: 2048.0 | grad norm: 5.848 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.934 | TFLOPs: 42.04 | +[default7]: iteration 6060/ 6200 | consumed samples: 6205440 | consumed tokens: 12708741120 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612962E+00 | loss scale: 2048.0 | grad norm: 5.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.825 | TFLOPs: 42.31 | +[default7]: iteration 6061/ 6200 | consumed samples: 6206464 | consumed tokens: 12710838272 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621583E+00 | loss scale: 2048.0 | grad norm: 5.775 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.950 | TFLOPs: 42.35 | +[default7]: iteration 6062/ 6200 | consumed samples: 6207488 | consumed tokens: 12712935424 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.596343E+00 | loss scale: 2048.0 | grad norm: 5.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.025 | TFLOPs: 42.38 | +[default7]: iteration 6063/ 6200 | consumed samples: 6208512 | consumed tokens: 12715032576 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644294E+00 | loss scale: 2048.0 | grad norm: 5.612 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.936 | TFLOPs: 42.35 | +[default7]: iteration 6064/ 6200 | consumed samples: 6209536 | consumed tokens: 12717129728 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612810E+00 | loss scale: 2048.0 | grad norm: 5.971 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.950 | TFLOPs: 42.35 | +[default7]: iteration 6065/ 6200 | consumed samples: 6210560 | consumed tokens: 12719226880 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.641367E+00 | loss scale: 2048.0 | grad norm: 5.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.815 | TFLOPs: 42.31 | +[default7]: iteration 6066/ 6200 | consumed samples: 6211584 | consumed tokens: 12721324032 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624262E+00 | loss scale: 2048.0 | grad norm: 5.795 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.920 | TFLOPs: 42.34 | +[default7]: iteration 6067/ 6200 | consumed samples: 6212608 | consumed tokens: 12723421184 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.595594E+00 | loss scale: 2048.0 | grad norm: 5.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.762 | TFLOPs: 42.30 | +[default7]: iteration 6068/ 6200 | consumed samples: 6213632 | consumed tokens: 12725518336 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.599385E+00 | loss scale: 2048.0 | grad norm: 5.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.853 | TFLOPs: 42.32 | +[default7]: iteration 6069/ 6200 | consumed samples: 6214656 | consumed tokens: 12727615488 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618856E+00 | loss scale: 2048.0 | grad norm: 4.936 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.783 | TFLOPs: 42.30 | +[default7]: iteration 6070/ 6200 | consumed samples: 6215680 | consumed tokens: 12729712640 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611109E+00 | loss scale: 2048.0 | grad norm: 5.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.923 | TFLOPs: 42.34 | +[default7]: iteration 6071/ 6200 | consumed samples: 6216704 | consumed tokens: 12731809792 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.598391E+00 | loss scale: 2048.0 | grad norm: 5.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.996 | TFLOPs: 42.37 | +[default7]: iteration 6072/ 6200 | consumed samples: 6217728 | consumed tokens: 12733906944 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638002E+00 | loss scale: 2048.0 | grad norm: 5.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.031 | TFLOPs: 42.38 | +[default7]: iteration 6073/ 6200 | consumed samples: 6218752 | consumed tokens: 12736004096 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621835E+00 | loss scale: 2048.0 | grad norm: 5.036 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.987 | TFLOPs: 42.36 | +[default7]: iteration 6074/ 6200 | consumed samples: 6219776 | consumed tokens: 12738101248 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.631547E+00 | loss scale: 2048.0 | grad norm: 5.834 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.062 | TFLOPs: 42.39 | +[default7]: iteration 6075/ 6200 | consumed samples: 6220800 | consumed tokens: 12740198400 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632667E+00 | loss scale: 2048.0 | grad norm: 6.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.693 | TFLOPs: 42.27 | +[default7]: iteration 6076/ 6200 | consumed samples: 6221824 | consumed tokens: 12742295552 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630358E+00 | loss scale: 2048.0 | grad norm: 7.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.811 | TFLOPs: 42.31 | +[default7]: iteration 6077/ 6200 | consumed samples: 6222848 | consumed tokens: 12744392704 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.608730E+00 | loss scale: 2048.0 | grad norm: 5.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.696 | TFLOPs: 42.28 | +[default7]: iteration 6078/ 6200 | consumed samples: 6223872 | consumed tokens: 12746489856 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609159E+00 | loss scale: 2048.0 | grad norm: 6.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.563 | TFLOPs: 42.24 | +[default7]: iteration 6079/ 6200 | consumed samples: 6224896 | consumed tokens: 12748587008 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610347E+00 | loss scale: 2048.0 | grad norm: 6.816 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.649 | TFLOPs: 42.26 | +[default7]: iteration 6080/ 6200 | consumed samples: 6225920 | consumed tokens: 12750684160 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.605410E+00 | loss scale: 2048.0 | grad norm: 5.774 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.680 | TFLOPs: 42.27 | +[default7]: iteration 6081/ 6200 | consumed samples: 6226944 | consumed tokens: 12752781312 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640064E+00 | loss scale: 2048.0 | grad norm: 5.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.613 | TFLOPs: 42.25 | +[default7]: iteration 6082/ 6200 | consumed samples: 6227968 | consumed tokens: 12754878464 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.591755E+00 | loss scale: 2048.0 | grad norm: 5.840 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.541 | TFLOPs: 42.23 | +[default7]: iteration 6083/ 6200 | consumed samples: 6228992 | consumed tokens: 12756975616 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621729E+00 | loss scale: 2048.0 | grad norm: 6.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.688 | TFLOPs: 42.27 | +[default7]: iteration 6084/ 6200 | consumed samples: 6230016 | consumed tokens: 12759072768 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625711E+00 | loss scale: 2048.0 | grad norm: 6.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.558 | TFLOPs: 42.23 | +[default7]: iteration 6085/ 6200 | consumed samples: 6231040 | consumed tokens: 12761169920 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.644144E+00 | loss scale: 2048.0 | grad norm: 4.904 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.002 | TFLOPs: 42.37 | +[default7]: iteration 6086/ 6200 | consumed samples: 6232064 | consumed tokens: 12763267072 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636718E+00 | loss scale: 2048.0 | grad norm: 4.887 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.562 | TFLOPs: 42.23 | +[default7]: iteration 6087/ 6200 | consumed samples: 6233088 | consumed tokens: 12765364224 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609296E+00 | loss scale: 2048.0 | grad norm: 4.811 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.671 | TFLOPs: 42.27 | +[default7]: iteration 6088/ 6200 | consumed samples: 6234112 | consumed tokens: 12767461376 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.606468E+00 | loss scale: 2048.0 | grad norm: 5.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.587 | TFLOPs: 42.24 | +[default7]: iteration 6089/ 6200 | consumed samples: 6235136 | consumed tokens: 12769558528 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.608649E+00 | loss scale: 2048.0 | grad norm: 4.859 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.773 | TFLOPs: 42.30 | +[default7]: iteration 6090/ 6200 | consumed samples: 6236160 | consumed tokens: 12771655680 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.608711E+00 | loss scale: 2048.0 | grad norm: 5.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.784 | TFLOPs: 42.30 | +[default7]: iteration 6091/ 6200 | consumed samples: 6237184 | consumed tokens: 12773752832 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.608548E+00 | loss scale: 2048.0 | grad norm: 4.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.756 | TFLOPs: 42.29 | +[default7]: iteration 6092/ 6200 | consumed samples: 6238208 | consumed tokens: 12775849984 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.581791E+00 | loss scale: 2048.0 | grad norm: 5.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.762 | TFLOPs: 42.30 | +[default7]: iteration 6093/ 6200 | consumed samples: 6239232 | consumed tokens: 12777947136 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.598102E+00 | loss scale: 2048.0 | grad norm: 5.783 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.491 | TFLOPs: 42.21 | +[default7]: iteration 6094/ 6200 | consumed samples: 6240256 | consumed tokens: 12780044288 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612807E+00 | loss scale: 2048.0 | grad norm: 4.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.284 | TFLOPs: 42.15 | +[default7]: iteration 6095/ 6200 | consumed samples: 6241280 | consumed tokens: 12782141440 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611971E+00 | loss scale: 2048.0 | grad norm: 5.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.336 | TFLOPs: 42.17 | +[default7]: iteration 6096/ 6200 | consumed samples: 6242304 | consumed tokens: 12784238592 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618787E+00 | loss scale: 2048.0 | grad norm: 5.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.499 | TFLOPs: 42.22 | +[default7]: iteration 6097/ 6200 | consumed samples: 6243328 | consumed tokens: 12786335744 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.630426E+00 | loss scale: 2048.0 | grad norm: 4.777 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.487 | TFLOPs: 42.21 | +[default7]: iteration 6098/ 6200 | consumed samples: 6244352 | consumed tokens: 12788432896 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.616674E+00 | loss scale: 2048.0 | grad norm: 5.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.648 | TFLOPs: 41.96 | +[default7]: iteration 6099/ 6200 | consumed samples: 6245376 | consumed tokens: 12790530048 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.659353E+00 | loss scale: 2048.0 | grad norm: 5.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.433 | TFLOPs: 42.20 | +[default7]: iteration 6100/ 6200 | consumed samples: 6246400 | consumed tokens: 12792627200 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625221E+00 | loss scale: 2048.0 | grad norm: 4.682 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.519 | TFLOPs: 42.22 | +[default7]: iteration 6101/ 6200 | consumed samples: 6247424 | consumed tokens: 12794724352 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614806E+00 | loss scale: 2048.0 | grad norm: 5.103 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.499 | TFLOPs: 42.22 | +[default7]: iteration 6102/ 6200 | consumed samples: 6248448 | consumed tokens: 12796821504 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.598881E+00 | loss scale: 2048.0 | grad norm: 5.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.736 | TFLOPs: 42.29 | +[default7]: iteration 6103/ 6200 | consumed samples: 6249472 | consumed tokens: 12798918656 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629666E+00 | loss scale: 2048.0 | grad norm: 5.679 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.750 | TFLOPs: 42.29 | +[default7]: iteration 6104/ 6200 | consumed samples: 6250496 | consumed tokens: 12801015808 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.593489E+00 | loss scale: 2048.0 | grad norm: 5.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.502 | TFLOPs: 42.22 | +[default7]: iteration 6105/ 6200 | consumed samples: 6251520 | consumed tokens: 12803112960 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625819E+00 | loss scale: 2048.0 | grad norm: 5.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.479 | TFLOPs: 42.21 | +[default7]: iteration 6106/ 6200 | consumed samples: 6252544 | consumed tokens: 12805210112 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610766E+00 | loss scale: 2048.0 | grad norm: 6.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.314 | TFLOPs: 42.16 | +[default7]: iteration 6107/ 6200 | consumed samples: 6253568 | consumed tokens: 12807307264 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.587205E+00 | loss scale: 2048.0 | grad norm: 5.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.563 | TFLOPs: 42.24 | +[default7]: iteration 6108/ 6200 | consumed samples: 6254592 | consumed tokens: 12809404416 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614077E+00 | loss scale: 2048.0 | grad norm: 4.700 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.825 | TFLOPs: 42.31 | +[default7]: iteration 6109/ 6200 | consumed samples: 6255616 | consumed tokens: 12811501568 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625697E+00 | loss scale: 2048.0 | grad norm: 5.652 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.531 | TFLOPs: 42.23 | +[default7]: iteration 6110/ 6200 | consumed samples: 6256640 | consumed tokens: 12813598720 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.606049E+00 | loss scale: 2048.0 | grad norm: 6.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.810 | TFLOPs: 42.31 | +[default7]: iteration 6111/ 6200 | consumed samples: 6257664 | consumed tokens: 12815695872 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617717E+00 | loss scale: 2048.0 | grad norm: 6.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.926 | TFLOPs: 42.35 | +[default7]: iteration 6112/ 6200 | consumed samples: 6258688 | consumed tokens: 12817793024 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622765E+00 | loss scale: 2048.0 | grad norm: 5.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.046 | TFLOPs: 42.38 | +[default7]: iteration 6113/ 6200 | consumed samples: 6259712 | consumed tokens: 12819890176 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638471E+00 | loss scale: 2048.0 | grad norm: 7.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.981 | TFLOPs: 42.36 | +[default7]: iteration 6114/ 6200 | consumed samples: 6260736 | consumed tokens: 12821987328 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625555E+00 | loss scale: 2048.0 | grad norm: 5.753 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.859 | TFLOPs: 42.33 | +[default7]: iteration 6115/ 6200 | consumed samples: 6261760 | consumed tokens: 12824084480 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615808E+00 | loss scale: 2048.0 | grad norm: 4.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.812 | TFLOPs: 42.31 | +[default7]: iteration 6116/ 6200 | consumed samples: 6262784 | consumed tokens: 12826181632 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617009E+00 | loss scale: 2048.0 | grad norm: 6.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.025 | TFLOPs: 42.38 | +[default7]: iteration 6117/ 6200 | consumed samples: 6263808 | consumed tokens: 12828278784 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610862E+00 | loss scale: 2048.0 | grad norm: 5.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.064 | TFLOPs: 42.39 | +[default7]: iteration 6118/ 6200 | consumed samples: 6264832 | consumed tokens: 12830375936 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.638960E+00 | loss scale: 2048.0 | grad norm: 5.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.815 | TFLOPs: 42.31 | +[default7]: iteration 6119/ 6200 | consumed samples: 6265856 | consumed tokens: 12832473088 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615895E+00 | loss scale: 2048.0 | grad norm: 5.074 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.571 | TFLOPs: 42.24 | +[default7]: iteration 6120/ 6200 | consumed samples: 6266880 | consumed tokens: 12834570240 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.605160E+00 | loss scale: 2048.0 | grad norm: 7.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.819 | TFLOPs: 42.31 | +[default7]: iteration 6121/ 6200 | consumed samples: 6267904 | consumed tokens: 12836667392 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625234E+00 | loss scale: 2048.0 | grad norm: 5.784 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.889 | TFLOPs: 42.33 | +[default7]: iteration 6122/ 6200 | consumed samples: 6268928 | consumed tokens: 12838764544 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628852E+00 | loss scale: 2048.0 | grad norm: 4.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.796 | TFLOPs: 42.31 | +[default7]: iteration 6123/ 6200 | consumed samples: 6269952 | consumed tokens: 12840861696 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625517E+00 | loss scale: 2048.0 | grad norm: 4.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.836 | TFLOPs: 42.32 | +[default7]: iteration 6124/ 6200 | consumed samples: 6270976 | consumed tokens: 12842958848 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610842E+00 | loss scale: 2048.0 | grad norm: 4.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.858 | TFLOPs: 42.32 | +[default7]: iteration 6125/ 6200 | consumed samples: 6272000 | consumed tokens: 12845056000 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.632185E+00 | loss scale: 2048.0 | grad norm: 5.851 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.037 | TFLOPs: 42.38 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at iteration 6125 | lm loss value: 3.710687E+00 | lm loss PPL: 4.088190E+01 | +[default7]:----------------------------------------------------------------------------------------------------------- +[default7]:----------------------------------------------------------------------------------------------- +[default7]:validation loss at iteration 6125 | lm loss value: 1.509905E+00 | lm loss PPL: 4.526300E+00 | +[default7]:----------------------------------------------------------------------------------------------- +[default7]: iteration 6126/ 6200 | consumed samples: 6273024 | consumed tokens: 12847153152 | elapsed time per iteration (s): 51.73 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.599124E+00 | loss scale: 2048.0 | grad norm: 6.061 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 19.796 | TFLOPs: 6.03 | +[default7]: iteration 6127/ 6200 | consumed samples: 6274048 | consumed tokens: 12849250304 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.601612E+00 | loss scale: 2048.0 | grad norm: 6.023 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.228 | TFLOPs: 42.13 | +[default7]: iteration 6128/ 6200 | consumed samples: 6275072 | consumed tokens: 12851347456 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.640659E+00 | loss scale: 2048.0 | grad norm: 5.007 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.740 | TFLOPs: 42.29 | +[default7]: iteration 6129/ 6200 | consumed samples: 6276096 | consumed tokens: 12853444608 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617585E+00 | loss scale: 2048.0 | grad norm: 5.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.594 | TFLOPs: 42.24 | +[default7]: iteration 6130/ 6200 | consumed samples: 6277120 | consumed tokens: 12855541760 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625281E+00 | loss scale: 2048.0 | grad norm: 4.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.468 | TFLOPs: 42.21 | +[default7]: iteration 6131/ 6200 | consumed samples: 6278144 | consumed tokens: 12857638912 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618295E+00 | loss scale: 2048.0 | grad norm: 6.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.961 | TFLOPs: 42.05 | +[default7]: iteration 6132/ 6200 | consumed samples: 6279168 | consumed tokens: 12859736064 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.603250E+00 | loss scale: 2048.0 | grad norm: 5.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.781 | TFLOPs: 42.00 | +[default7]: iteration 6133/ 6200 | consumed samples: 6280192 | consumed tokens: 12861833216 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.625816E+00 | loss scale: 2048.0 | grad norm: 5.795 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.962 | TFLOPs: 42.36 | +[default7]: iteration 6134/ 6200 | consumed samples: 6281216 | consumed tokens: 12863930368 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614673E+00 | loss scale: 2048.0 | grad norm: 4.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.783 | TFLOPs: 42.30 | +[default7]: iteration 6135/ 6200 | consumed samples: 6282240 | consumed tokens: 12866027520 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611904E+00 | loss scale: 2048.0 | grad norm: 5.640 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.600 | TFLOPs: 42.25 | +[default7]: iteration 6136/ 6200 | consumed samples: 6283264 | consumed tokens: 12868124672 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.588009E+00 | loss scale: 2048.0 | grad norm: 5.886 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.574 | TFLOPs: 41.93 | +[default7]: iteration 6137/ 6200 | consumed samples: 6284288 | consumed tokens: 12870221824 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629982E+00 | loss scale: 2048.0 | grad norm: 4.675 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.663 | TFLOPs: 42.27 | +[default7]: iteration 6138/ 6200 | consumed samples: 6285312 | consumed tokens: 12872318976 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.608657E+00 | loss scale: 2048.0 | grad norm: 4.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.814 | TFLOPs: 42.31 | +[default7]: iteration 6139/ 6200 | consumed samples: 6286336 | consumed tokens: 12874416128 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626898E+00 | loss scale: 2048.0 | grad norm: 5.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.630 | TFLOPs: 42.26 | +[default7]: iteration 6140/ 6200 | consumed samples: 6287360 | consumed tokens: 12876513280 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611302E+00 | loss scale: 2048.0 | grad norm: 5.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.743 | TFLOPs: 42.29 | +[default7]: iteration 6141/ 6200 | consumed samples: 6288384 | consumed tokens: 12878610432 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611839E+00 | loss scale: 2048.0 | grad norm: 4.934 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.717 | TFLOPs: 42.28 | +[default7]: iteration 6142/ 6200 | consumed samples: 6289408 | consumed tokens: 12880707584 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.636109E+00 | loss scale: 2048.0 | grad norm: 4.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.622 | TFLOPs: 42.25 | +[default7]: iteration 6143/ 6200 | consumed samples: 6290432 | consumed tokens: 12882804736 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.604525E+00 | loss scale: 2048.0 | grad norm: 5.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.613 | TFLOPs: 42.25 | +[default7]: iteration 6144/ 6200 | consumed samples: 6291456 | consumed tokens: 12884901888 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.613875E+00 | loss scale: 2048.0 | grad norm: 5.769 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.016 | TFLOPs: 42.37 | +[default7]: iteration 6145/ 6200 | consumed samples: 6292480 | consumed tokens: 12886999040 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.595796E+00 | loss scale: 2048.0 | grad norm: 5.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.842 | TFLOPs: 42.32 | +[default7]: iteration 6146/ 6200 | consumed samples: 6293504 | consumed tokens: 12889096192 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.591596E+00 | loss scale: 2048.0 | grad norm: 5.903 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.777 | TFLOPs: 42.30 | +[default7]: iteration 6147/ 6200 | consumed samples: 6294528 | consumed tokens: 12891193344 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.583005E+00 | loss scale: 2048.0 | grad norm: 5.574 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.987 | TFLOPs: 42.36 | +[default7]: iteration 6148/ 6200 | consumed samples: 6295552 | consumed tokens: 12893290496 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.613650E+00 | loss scale: 2048.0 | grad norm: 5.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.993 | TFLOPs: 42.37 | +[default7]: iteration 6149/ 6200 | consumed samples: 6296576 | consumed tokens: 12895387648 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.628461E+00 | loss scale: 2048.0 | grad norm: 6.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.832 | TFLOPs: 42.32 | +[default7]: iteration 6150/ 6200 | consumed samples: 6297600 | consumed tokens: 12897484800 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.594253E+00 | loss scale: 2048.0 | grad norm: 5.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.645 | TFLOPs: 42.26 | +[default7]: iteration 6151/ 6200 | consumed samples: 6298624 | consumed tokens: 12899581952 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615413E+00 | loss scale: 2048.0 | grad norm: 5.845 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.950 | TFLOPs: 42.35 | +[default7]: iteration 6152/ 6200 | consumed samples: 6299648 | consumed tokens: 12901679104 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.650608E+00 | loss scale: 2048.0 | grad norm: 5.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.699 | TFLOPs: 42.28 | +[default7]: iteration 6153/ 6200 | consumed samples: 6300672 | consumed tokens: 12903776256 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.603939E+00 | loss scale: 2048.0 | grad norm: 5.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.780 | TFLOPs: 42.30 | +[default7]: iteration 6154/ 6200 | consumed samples: 6301696 | consumed tokens: 12905873408 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610998E+00 | loss scale: 2048.0 | grad norm: 5.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.877 | TFLOPs: 42.33 | +[default7]: iteration 6155/ 6200 | consumed samples: 6302720 | consumed tokens: 12907970560 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.613263E+00 | loss scale: 2048.0 | grad norm: 5.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.907 | TFLOPs: 42.34 | +[default7]: iteration 6156/ 6200 | consumed samples: 6303744 | consumed tokens: 12910067712 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.600324E+00 | loss scale: 2048.0 | grad norm: 5.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.929 | TFLOPs: 42.35 | +[default7]: iteration 6157/ 6200 | consumed samples: 6304768 | consumed tokens: 12912164864 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.611881E+00 | loss scale: 2048.0 | grad norm: 5.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.066 | TFLOPs: 42.39 | +[default7]: iteration 6158/ 6200 | consumed samples: 6305792 | consumed tokens: 12914262016 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.605280E+00 | loss scale: 2048.0 | grad norm: 5.002 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.913 | TFLOPs: 42.34 | +[default7]: iteration 6159/ 6200 | consumed samples: 6306816 | consumed tokens: 12916359168 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.639672E+00 | loss scale: 2048.0 | grad norm: 4.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.726 | TFLOPs: 42.28 | +[default7]: iteration 6160/ 6200 | consumed samples: 6307840 | consumed tokens: 12918456320 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.633164E+00 | loss scale: 2048.0 | grad norm: 5.045 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.989 | TFLOPs: 42.37 | +[default7]: iteration 6161/ 6200 | consumed samples: 6308864 | consumed tokens: 12920553472 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622796E+00 | loss scale: 2048.0 | grad norm: 5.542 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.795 | TFLOPs: 42.00 | +[default7]: iteration 6162/ 6200 | consumed samples: 6309888 | consumed tokens: 12922650624 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634514E+00 | loss scale: 2048.0 | grad norm: 5.045 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.803 | TFLOPs: 42.31 | +[default7]: iteration 6163/ 6200 | consumed samples: 6310912 | consumed tokens: 12924747776 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615121E+00 | loss scale: 2048.0 | grad norm: 4.877 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.091 | TFLOPs: 42.40 | +[default7]: iteration 6164/ 6200 | consumed samples: 6311936 | consumed tokens: 12926844928 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.604167E+00 | loss scale: 2048.0 | grad norm: 5.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.939 | TFLOPs: 42.35 | +[default7]: iteration 6165/ 6200 | consumed samples: 6312960 | consumed tokens: 12928942080 | elapsed time per iteration (s): 7.43 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.597416E+00 | loss scale: 2048.0 | grad norm: 5.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.759 | TFLOPs: 41.99 | +[default7]: iteration 6166/ 6200 | consumed samples: 6313984 | consumed tokens: 12931039232 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618932E+00 | loss scale: 2048.0 | grad norm: 4.995 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.923 | TFLOPs: 42.34 | +[default7]: iteration 6167/ 6200 | consumed samples: 6315008 | consumed tokens: 12933136384 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622298E+00 | loss scale: 2048.0 | grad norm: 4.858 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.855 | TFLOPs: 42.32 | +[default7]: iteration 6168/ 6200 | consumed samples: 6316032 | consumed tokens: 12935233536 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.626299E+00 | loss scale: 2048.0 | grad norm: 4.637 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.749 | TFLOPs: 42.29 | +[default7]: iteration 6169/ 6200 | consumed samples: 6317056 | consumed tokens: 12937330688 | elapsed time per iteration (s): 7.45 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.575323E+00 | loss scale: 2048.0 | grad norm: 4.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.539 | TFLOPs: 41.92 | +[default7]: iteration 6170/ 6200 | consumed samples: 6318080 | consumed tokens: 12939427840 | elapsed time per iteration (s): 7.44 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624797E+00 | loss scale: 2048.0 | grad norm: 4.927 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 137.727 | TFLOPs: 41.98 | +[default7]: iteration 6171/ 6200 | consumed samples: 6319104 | consumed tokens: 12941524992 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.609873E+00 | loss scale: 2048.0 | grad norm: 4.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.536 | TFLOPs: 42.23 | +[default7]: iteration 6172/ 6200 | consumed samples: 6320128 | consumed tokens: 12943622144 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.596564E+00 | loss scale: 2048.0 | grad norm: 4.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.528 | TFLOPs: 42.22 | +[default7]: iteration 6173/ 6200 | consumed samples: 6321152 | consumed tokens: 12945719296 | elapsed time per iteration (s): 7.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622026E+00 | loss scale: 2048.0 | grad norm: 5.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.031 | TFLOPs: 42.07 | +[default7]: iteration 6174/ 6200 | consumed samples: 6322176 | consumed tokens: 12947816448 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.634306E+00 | loss scale: 2048.0 | grad norm: 4.733 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.912 | TFLOPs: 42.34 | +[default7]: iteration 6175/ 6200 | consumed samples: 6323200 | consumed tokens: 12949913600 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617037E+00 | loss scale: 2048.0 | grad norm: 4.748 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.612 | TFLOPs: 42.25 | +[default7]: iteration 6176/ 6200 | consumed samples: 6324224 | consumed tokens: 12952010752 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.618139E+00 | loss scale: 2048.0 | grad norm: 6.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.563 | TFLOPs: 42.24 | +[default7]: iteration 6177/ 6200 | consumed samples: 6325248 | consumed tokens: 12954107904 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619846E+00 | loss scale: 2048.0 | grad norm: 5.692 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.608 | TFLOPs: 42.25 | +[default7]: iteration 6178/ 6200 | consumed samples: 6326272 | consumed tokens: 12956205056 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.619164E+00 | loss scale: 2048.0 | grad norm: 5.081 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.840 | TFLOPs: 42.32 | +[default7]: iteration 6179/ 6200 | consumed samples: 6327296 | consumed tokens: 12958302208 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.629375E+00 | loss scale: 2048.0 | grad norm: 4.761 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.010 | TFLOPs: 42.37 | +[default7]: iteration 6180/ 6200 | consumed samples: 6328320 | consumed tokens: 12960399360 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.594050E+00 | loss scale: 2048.0 | grad norm: 4.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.998 | TFLOPs: 42.37 | +[default7]: iteration 6181/ 6200 | consumed samples: 6329344 | consumed tokens: 12962496512 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.622276E+00 | loss scale: 2048.0 | grad norm: 5.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.720 | TFLOPs: 42.28 | +[default7]: iteration 6182/ 6200 | consumed samples: 6330368 | consumed tokens: 12964593664 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.599762E+00 | loss scale: 2048.0 | grad norm: 4.589 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.121 | TFLOPs: 42.41 | +[default7]: iteration 6183/ 6200 | consumed samples: 6331392 | consumed tokens: 12966690816 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.615390E+00 | loss scale: 2048.0 | grad norm: 5.825 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.903 | TFLOPs: 42.34 | +[default7]: iteration 6184/ 6200 | consumed samples: 6332416 | consumed tokens: 12968787968 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.623591E+00 | loss scale: 2048.0 | grad norm: 4.767 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.957 | TFLOPs: 42.36 | +[default7]: iteration 6185/ 6200 | consumed samples: 6333440 | consumed tokens: 12970885120 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.600487E+00 | loss scale: 2048.0 | grad norm: 5.032 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.969 | TFLOPs: 42.36 | +[default7]: iteration 6186/ 6200 | consumed samples: 6334464 | consumed tokens: 12972982272 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.595893E+00 | loss scale: 2048.0 | grad norm: 4.749 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.670 | TFLOPs: 42.27 | +[default7]: iteration 6187/ 6200 | consumed samples: 6335488 | consumed tokens: 12975079424 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.621407E+00 | loss scale: 2048.0 | grad norm: 5.858 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.685 | TFLOPs: 42.27 | +[default7]: iteration 6188/ 6200 | consumed samples: 6336512 | consumed tokens: 12977176576 | elapsed time per iteration (s): 7.39 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.605792E+00 | loss scale: 2048.0 | grad norm: 4.750 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.585 | TFLOPs: 42.24 | +[default7]: iteration 6189/ 6200 | consumed samples: 6337536 | consumed tokens: 12979273728 | elapsed time per iteration (s): 7.38 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.604658E+00 | loss scale: 2048.0 | grad norm: 5.859 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.713 | TFLOPs: 42.28 | +[default7]: iteration 6190/ 6200 | consumed samples: 6338560 | consumed tokens: 12981370880 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.617589E+00 | loss scale: 2048.0 | grad norm: 4.976 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.115 | TFLOPs: 42.40 | +[default7]: iteration 6191/ 6200 | consumed samples: 6339584 | consumed tokens: 12983468032 | elapsed time per iteration (s): 7.35 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.602819E+00 | loss scale: 2048.0 | grad norm: 5.823 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.241 | TFLOPs: 42.44 | +[default7]: iteration 6192/ 6200 | consumed samples: 6340608 | consumed tokens: 12985565184 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.589598E+00 | loss scale: 2048.0 | grad norm: 5.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.028 | TFLOPs: 42.38 | +[default7]: iteration 6193/ 6200 | consumed samples: 6341632 | consumed tokens: 12987662336 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.595149E+00 | loss scale: 2048.0 | grad norm: 4.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.194 | TFLOPs: 42.43 | +[default7]: iteration 6194/ 6200 | consumed samples: 6342656 | consumed tokens: 12989759488 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.624402E+00 | loss scale: 2048.0 | grad norm: 5.826 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.942 | TFLOPs: 42.35 | +[default7]: iteration 6195/ 6200 | consumed samples: 6343680 | consumed tokens: 12991856640 | elapsed time per iteration (s): 7.41 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.603181E+00 | loss scale: 2048.0 | grad norm: 5.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.100 | TFLOPs: 42.09 | +[default7]: iteration 6196/ 6200 | consumed samples: 6344704 | consumed tokens: 12993953792 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.614330E+00 | loss scale: 2048.0 | grad norm: 4.863 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.066 | TFLOPs: 42.39 | +[default7]: iteration 6197/ 6200 | consumed samples: 6345728 | consumed tokens: 12996050944 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.606325E+00 | loss scale: 2048.0 | grad norm: 5.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.195 | TFLOPs: 42.43 | +[default7]: iteration 6198/ 6200 | consumed samples: 6346752 | consumed tokens: 12998148096 | elapsed time per iteration (s): 7.36 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.612150E+00 | loss scale: 2048.0 | grad norm: 4.842 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 139.087 | TFLOPs: 42.39 | +[default7]: iteration 6199/ 6200 | consumed samples: 6347776 | consumed tokens: 13000245248 | elapsed time per iteration (s): 7.40 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.610840E+00 | loss scale: 2048.0 | grad norm: 5.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.347 | TFLOPs: 42.17 | +[default0]:[after training is done] datetime: 2022-10-07 02:03:29 +[default7]: iteration 6200/ 6200 | consumed samples: 6348800 | consumed tokens: 13002342400 | elapsed time per iteration (s): 7.37 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 1.613196E+00 | loss scale: 2048.0 | grad norm: 5.029 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.866 | TFLOPs: 42.33 | +[default7]:----------------------------------------------------------------------------------------------------------------------------- +[default7]:validation_pretraining loss at the end of training for val data | lm loss value: 3.724028E+00 | lm loss PPL: 4.143093E+01 | +[default7]:----------------------------------------------------------------------------------------------------------------------------- +[default0]:saving checkpoint at iteration 6200 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default0]:[2022-10-07 02:04:14,288] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step6200 is begin to save! +[default7]:----------------------------------------------------------------------------------------------------------------- +[default7]:validation loss at the end of training for val data | lm loss value: 1.502029E+00 | lm loss PPL: 4.490793E+00 | +[default7]:----------------------------------------------------------------------------------------------------------------- +[default0]:[2022-10-07 02:04:14,293] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_01-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:14,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_01-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:14,681] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_03-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:14,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_03-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:14,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_04-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:14,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_04-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:14,738] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_05-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:14,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_05-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:14,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_06-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:14,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_06-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:14,794] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_07-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:14,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_07-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:14,821] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_08-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:14,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_08-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:14,849] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_09-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:14,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_09-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:14,875] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_10-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:14,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_10-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:14,902] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_11-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:14,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_11-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:14,929] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_12-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:14,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_12-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:14,956] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_13-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:14,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_13-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:14,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_14-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:15,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_14-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:15,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_15-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:15,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_15-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:15,039] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_16-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:15,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_16-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:15,066] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_17-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:15,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_17-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:15,093] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_18-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:15,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_18-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:15,119] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_19-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:15,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_19-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:15,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_20-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:15,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_20-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:15,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_21-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:15,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_21-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:15,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_22-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:15,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_22-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:15,232] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_23-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:15,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_23-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:15,259] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_24-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:15,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_24-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:15,286] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_25-model_00-model_states.pt... +[default3]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 02:04:15,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_25-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:15,316] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_26-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:15,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_26-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:15,342] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_28-model_00-model_states.pt... +[default0]:[2022-10-07 02:04:15,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/layer_28-model_00-model_states.pt. +[default0]:[2022-10-07 02:04:15,344] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/mp_rank_00_model_states.pt +[default0]:[2022-10-07 02:04:15,344] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/mp_rank_00_model_states.pt... +[default0]:[2022-10-07 02:04:15,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/mp_rank_00_model_states.pt. +[default0]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default1]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default4]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default0]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default7]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default2]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default3]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default6]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 02:04:15,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default5]:[2022-10-07 02:04:15,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 02:04:15,573] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 02:04:15,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 02:04:15,536] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 02:04:15,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 02:04:15,550] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 02:04:15,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 02:04:15,562] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 02:04:15,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 02:04:15,637] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 02:04:15,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 02:04:15,636] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 02:04:15,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 02:04:15,640] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 02:04:15,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 02:04:15,645] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 02:04:15,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 02:04:15,639] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default3]:[2022-10-07 02:04:15,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default3]:[2022-10-07 02:04:15,666] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 02:04:15,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 02:04:15,660] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 02:04:15,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 02:04:15,667] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 02:04:15,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 02:04:15,636] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 02:04:15,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 02:04:15,681] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 02:04:15,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 02:04:15,683] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 02:04:15,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 02:04:15,681] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 02:04:15,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 02:04:15,662] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 02:04:15,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 02:04:15,701] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 02:04:15,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 02:04:15,662] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 02:04:15,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 02:04:15,716] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 02:04:15,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 02:04:15,664] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 02:04:15,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 02:04:15,686] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default2]:[2022-10-07 02:04:15,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default2]:[2022-10-07 02:04:15,668] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default7]:[2022-10-07 02:04:15,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default7]:[2022-10-07 02:04:15,663] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 02:04:15,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 02:04:15,682] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 02:04:15,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 02:04:15,696] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default6]:[2022-10-07 02:04:15,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default6]:[2022-10-07 02:04:15,661] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default6]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default5]:[2022-10-07 02:04:15,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default5]:[2022-10-07 02:04:15,733] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default5]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default3]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default3]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default3]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default2]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default1]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default5]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default4]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default0]:[2022-10-07 02:04:15,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2022-10-07 02:04:15,747] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default4]:[2022-10-07 02:04:15,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default4]:[2022-10-07 02:04:15,734] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default4]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default5]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default0]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default0]: successfully saved checkpoint at iteration 6200 to /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq +[default1]:[2022-10-07 02:04:15,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 02:04:15,695] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default7]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default4]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default5]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default6]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default1]:[2022-10-07 02:04:15,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default1]:[2022-10-07 02:04:15,748] [INFO] [engine.py:3188:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/six/commun/checkpoints/tr13e-350M-ml-t0/checkpoints/xp3capmixnewcodelonglossseq/global_step6200/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default1]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default6]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default1]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default2]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default7]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default4]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default7]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default2]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default0]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default7]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default0]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default2]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now! +[default3]:[2022-10-07 02:04:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6200 is ready now!